基于jsoup的网页爬虫
前阵子做了个网页抓取工具,可扩展性较差,今天发现一个开源的 Java HTML 解析库 jsoup,写了个测试,与大家分享下

package com.gump.net.html.test;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Test class: a concrete example of fetching and parsing an HTML page with jsoup.
 *
 * <p>Downloads one page, prints its {@code <title>} element(s), prints every
 * {@code <a>} element, and finally prints the elapsed wall-clock time in
 * milliseconds.
 *
 * @author ganliang13
 * @see <a href="http://ganliang13.iteye.com/">http://ganliang13.iteye.com/</a>
 */
public class test {

    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.qzone.cc/Gexing/Qian/02/26263.html";

    /** Timeout handed to the jsoup connection, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Fetches a page and dumps its title and anchor elements to standard output.
     *
     * @param args optional; {@code args[0]}, when present, is the URL to fetch.
     *             With no arguments the original sample page is used.
     * @throws IOException if jsoup fails to fetch or read the page
     */
    public static void main(String[] args) throws IOException {
        String url = args.length > 0 ? args[0] : DEFAULT_URL;

        long begin = System.currentTimeMillis();

        // The whole HTML document, fetched with an explicit timeout.
        Document doc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();

        // Print the document's <title> element(s).
        System.out.println(doc.getElementsByTag("title"));

        // Print every <a> element in the document.
        Elements anchors = doc.getElementsByTag("a");
        for (Element anchor : anchors) {
            System.out.println(anchor.toString());
        }

        // Elapsed time in milliseconds, covering the fetch and the printing.
        long end = System.currentTimeMillis();
        System.out.println(end - begin);
    }
}
页:
[1]