抓取html
package com.neusoft.mid.parser;import java.io.IOException;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.Date;import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;import org.apache.commons.httpclient.HostConfiguration;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.HttpStatus;import org.apache.commons.httpclient.NTCredentials;import org.apache.commons.httpclient.auth.AuthScope;import org.apache.commons.httpclient.methods.GetMethod;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.log4j.Logger;import org.htmlparser.Node;import org.htmlparser.Parser;import org.htmlparser.Tag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.Span;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.visitors.HtmlPage;import org.htmlparser.visitors.NodeVisitor;import com.neusoft.mid.parser.bean.ContentBean;public class CatchContentimplements Runnable{private static final Logger logger = Logger.getLogger(CatchContent.class);final static boolean IS_INIT=false;//true问第一次使用 false为初始化之后使用 private DoSql doSql; private String type; private String url; private String maxDate;public DoSql getDoSql() {return doSql;}public void setDoSql(DoSql doSql) {this.doSql = doSql;}/*** 字符串转换成日期* @param str* @return date*/public static Date StrToDate(String str) { SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); Date date = null; try { date = format.parse(str); } catch (ParseException e) { logger.error(e); } return date;}/*** 判断日期大小*/ public boolean isLate(String createdate){if(this.maxDate ==null){return false;} if(StrToDate(createdate).getTime()<StrToDate(this.maxDate).getTime()){logger.info("日期小于数据库中最大日期,不予录入");return true; }return false;} publicvoid initInsertDB(String name,String createdate,String content){ ContentBean cbean = new ContentBean(); cbean.setName(name); cbean.setContent(content); cbean.setCreatedate(createdate); cbean.setType(type); try {doSql.insertData(cbean);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();} } publicvoid insertDB(String name,String createdate,String content){ if(isLate(createdate)){ logger.info("日期小于数据库中最大日期,不予录入"); }else{ ContentBean cbean = new ContentBean(); cbean.setName(name); cbean.setContent(content); cbean.setCreatedate(createdate); cbean.setType(type); try { doSql.insertData(cbean); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } }//分别读纯文本和链接 public static String[] readDateAndLink(String result) throws Exception { Parser myParser; // Parser parser; myParser = Parser.createParser(result, "GB2312"); HtmlPage htmlPage = new HtmlPage(myParser); myParser.visitAllNodesWith(htmlPage); NodeList nodelist ; nodelist = htmlPage.getBody(); int size= nodelist.size(); Node[] nodes = nodelist.toNodeArray(); StringBuffer sb = new StringBuffer(); for(int i =0;i<size;i++){ sb.append(nodes.toHtml().trim()); } /** * 获取url */ Parser parser = new Parser(sb.toString()); parser.setEncoding("GB2312"); // NodeList list = parser.extractAllNodesThatMatch(filter); final StringBuffer spanBuffer = new StringBuffer(); final StringBuffer titleBuffer = new StringBuffer(); final StringBuffer urlBuffer = new StringBuffer(); NodeVisitor visitor = new NodeVisitor() { public void visitTag(Tag tag) { if (tag instanceof Span) { spanBuffer.append(tag.getChildren().asString()); spanBuffer.append("`"); } else if (tag instanceof LinkTag) { urlBuffer.append(tag.getAttribute("href")); urlBuffer.append("`"); titleBuffer.append(tag.getAttribute("title")); titleBuffer.append("`"); } else if (tag instanceof org.htmlparser.tags.ParagraphTag ) { System.out.println( spanBuffer.append(tag.getChildren().asString())); } } }; parser.visitAllNodesWith(visitor); String allContent[] = {spanBuffer.toString(),urlBuffer.toString(),titleBuffer.toString()}; return allContent; } /** * 获得代理的httpClient * @return */ public static HttpClient getHttpClient(){ //构造HttpClientHttpClient httpClient = new HttpClient(); httpClient.getHostConfiguration().setProxy("192.168.107.28",8080);NTCredentials defaultcreds = new NTCredentials("帐号", "密码", "192.168.107.28", "hold");httpClient.getState().setProxyCredentials(AuthScope.ANY, defaultcreds); HostConfiguration hcf =new HostConfiguration(); hcf.setProxy("192.168.107.28",8080); return httpClient; } publicvoid save(String[] result){ //链接字符串 String url = result; //标题字符串 String title = result;//日期字符串 String date = result; String[] urlArr = url.split("`"); String[] titleArr = title.split("`"); String[] dateArr = date.split("`"); //判断是否为初始化第一次操作 if(IS_INIT){ for(int i = 0 ; i< urlArr.length;i++){ String _url = urlArr; String _title = titleArr; String _date = dateArr; String _content= getContent(_url); initInsertDB(_title,_date,_content); } }else{ for(int i = 0 ; i< urlArr.length;i++){ String _url = urlArr; String _title = titleArr; String _date = dateArr; String _content= getContent(_url); insertDB(_title,_date,_content); } } } public static String getContent(String _url) { HttpClient httpClient = getHttpClient(); //System.out.println("http://www.szeb.edu.cn/"+_url);//创建GET方法GetMethod getMethod = new GetMethod("http://www.szeb.edu.cn/"+_url); //使用系统提供的默认的恢复策略 getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); try { //执行getMethod int statusCode = httpClient.executeMethod(getMethod); if (statusCode != HttpStatus.SC_OK) { // System.err.println("抓取失败: " + getMethod.getStatusLine()); } //读取内容 byte[] responseBody = getMethod.getResponseBody(); String result= readContent(new String(responseBody)); return result;} catch (HttpException e) { //发生致命的异常,可能是协议不对或者返回的内容有问题 logger.error(e);} catch (IOException e) { //发生网络异常 e.printStackTrace(); } catch (Exception e) {logger.error(e);} finally { //释放 getMethod.releaseConnection(); } return null; } /** * 获得内容 * @param url * @return */ private static String readContent(String string) { Parser myParser; // Parser parser; myParser = Parser.createParser(string, "GB2312"); HtmlPage htmlPage = new HtmlPage(myParser); try {myParser.visitAllNodesWith(htmlPage);} catch (ParserException e) {logger.error(e);} //获得标题 // String textInPage = htmlPage.getTitle(); //System.out.println(textInPage); NodeList nodelist ; nodelist = htmlPage.getBody(); int size= nodelist.size(); Node[] nodes = nodelist.toNodeArray(); StringBuffer sb = new StringBuffer(); for(int i =5;i<size;i++){ // System.out.println(i+"@@@@@@@@@@@@@@@@@@@@"+nodes.toHtml().trim()); //获得内容 // System.out.println(i+"@@@@@@@@@@@@@@@@@@@@"+nodes.toPlainTextString()); //取得连接页面 sb.append(nodes.toPlainTextString()); } return sb.toString(); }/** * 获得连接以及标题 日期 * @param url * @return */ publicvoid getUrlAndTitle(String url){ HttpClient httpClient = getHttpClient(); //创建GET方法GetMethod getMethod = new GetMethod(url); //使用系统提供的默认的恢复策略 getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); try { //执行getMethod logger.info("#############开始抓取网页源代码###################"); int statusCode = httpClient.executeMethod(getMethod); if (statusCode != HttpStatus.SC_OK) { System.err.println("抓取失败: " + getMethod.getStatusLine()); } //读取内容 byte[] responseBody = getMethod.getResponseBody(); String result[]= readDateAndLink(new String(responseBody)); save(result);} catch (HttpException e) { //发生致命的异常,可能是协议不对或者返回的内容有问题 // System.out.println("没有连上网络"); logger.error(e); } catch (IOException e) { //发生网络异常 logger.error(e); } catch (Exception e) {logger.error(e);} finally { logger.info("#############抓取网页源代码结束###################");//System.out.println("#############抓取网页源代码结束###################"); //释放 getMethod.releaseConnection(); } } //main.jsp?start=0&PageCount=100000&totalnum=156&colid=504public void run() {getUrlAndTitle(this.url);}public String getType() {return type;}public void setType(String type) {this.type = type;}public String getUrl() {return url;}public void setUrl(String url) {this.url = url;}public String getMaxDate() {return maxDate;}public void setMaxDate(String maxDate) {this.maxDate = maxDate;}}
页:
[1]