一個(gè)簡(jiǎn)單的 Java 網(wǎng)絡(luò)爬蟲示例,由于時(shí)間原因,沒有進(jìn)一步解釋。
運(yùn)行前需要從官方網(wǎng)站下載 htmlparser.jar 包,并將其加入類路徑。
- ---------------------------------------------Spider.java-----------------------------------------------------------------
- import java.io.BufferedReader;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.List;
- import org.htmlparser.RemarkNode;
- import org.htmlparser.StringNode;
- import org.htmlparser.Node;
- import org.htmlparser.tags.*;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.StringFilter;
- import org.htmlparser.util.NodeIterator;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- import java.util.Queue;
- import java.util.LinkedList;
- public class Spider implements Runnable {
- boolean search_key_words = false;
- int count = 0;
- int limitsite = 10;
- int countsite = 1;
- String keyword = "中國(guó)";//搜索關(guān)鍵字
- Parser parser = new Parser();
- // List linklist = new ArrayList();
- String startsite = "";//搜索的其實(shí)站點(diǎn)
- SearchResultBean srb;//保存搜索結(jié)果
- List resultlist = new ArrayList();//搜索到關(guān)鍵字鏈接列表
- List searchedsite = new ArrayList();//已經(jīng)被搜索站點(diǎn)列表
- Queue linklist = new LinkedList();//需解析的鏈接列表
- HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
- public Spider(String keyword, String startsite) {
- this.keyword = keyword;
- this.startsite = startsite;
- linklist.add(startsite);
- srb = new SearchResultBean();
- }
- public void run() {
- // TODO Auto-generated method stub
- search(linklist);
- }
- public void search(Queue queue) {
- String url = "";
- while(!queue.isEmpty()){
- url = queue.peek().toString();//查找列隊(duì)
- try {
- if (!isSearched(searchedsite, url)) {
- if (isRobotAllowed(new URL(url)))//檢查該鏈接是否被允許搜索
- processHtml(url);
- else
- System.out.println("this page is disallowed to search");
- }
- } catch (Exception ex) {
- }
- queue.remove();
-
- }
-
- }
- /**
- * 解析HTML
- * @param url
- * @throws ParserException
- * @throws Exception
- */
- public void processHtml(String url) throws ParserException, Exception {
- searchedsite.add(url);
- count = 0;
- System.out.println("searching ... :" + url);
- parser.setURL(url);
- parser.setEncoding("GBK");
- URLConnection uc = parser.getConnection();
- uc.connect();
- //uc.getLastModified();
- NodeIterator nit = parser.elements();
-
- while (nit.hasMoreNodes()) {
- Node node = nit.nextNode();
- parserNode(node);
- }
- srb.setKeywords(keyword);
- srb.setUrl(url);
- srb.setCount_key_words(count);
- resultlist.add(srb);
- System.out.println("count keywords is :" + count);
- System.out.println("----------------------------------------------");
- }
- /**
- * 處理HTML標(biāo)簽
- * @param tag
- * @throws Exception
- */
- public void dealTag(Tag tag) throws Exception {
- NodeList list = tag.getChildren();
- if (list != null) {
- NodeIterator it = list.elements();
- while (it.hasMoreNodes()) {
- Node node = it.nextNode();
- parserNode(node);
- }
- }
- }
- /**
- * 處理HTML標(biāo)簽結(jié)點(diǎn)
- * @param node
- * @throws Exception
- */
- public void parserNode(Node node) throws Exception{
- if (node instanceof StringNode) {//判斷是否是文本結(jié)點(diǎn)
- StringNode sNode = (StringNode) node;
- StringFilter sf = new StringFilter(keyword,false);
- search_key_words = sf.accept(sNode);
- if (search_key_words) {
- count++;
- }
- // System.out.println("text is :"+sNode.getText().trim());
- } else if (node instanceof Tag) {//判斷是否是標(biāo)簽庫(kù)結(jié)點(diǎn)
- Tag atag = (Tag) node;
- if (atag instanceof TitleTag) {//判斷是否是標(biāo)TITLE結(jié)點(diǎn)
- srb.setTitle(atag.getText());
- }
- if (atag instanceof LinkTag) {//判斷是否是標(biāo)LINK結(jié)點(diǎn)
- LinkTag linkatag = (LinkTag) atag;
- checkLink(linkatag.getLink(), linklist);
- // System.out.println("-----------------this is link --------------");
- }
- dealTag(atag);
- } else if (node instanceof RemarkNode) {//判斷是否是注釋
- // System.out.println("this is remark");
- }
- }
- /*
- * 檢查鏈接是否需要加入列隊(duì)
- */
- public void checkLink(String link, Queue queue) {
- if (link != null && !link.equals("") && link.indexOf("#") == -1) {
- if (!link.startsWith("http://") && !link.startsWith("ftp://")
- && !link.startsWith("www.")) {
- link = "file:///" + link;
- } else if (link.startsWith("www.")) {
- link = "http://" + link;
- }
- if (queue.isEmpty())
- queue.add(link);
- else {
- String link_end_=link.endsWith("/")?link.substring(0,link.lastIndexOf("/")):(link+"/");
- if (!queue.contains(link)&&!queue .contains(link_end_)) {
- queue.add(link);
- }
- }
- }
- }
- /**
- * 檢查該鏈接是否已經(jīng)被掃描
- * @param list
- * @param url
- * @return
- */
- public boolean isSearched(List list, String url) {
- String url_end_ = "";
- if (url.endsWith("/")) {
- url_end_ = url.substring(0, url.lastIndexOf("/"));
- } else {
- url_end_ = url + "/";
- }
- if (list.size() > 0) {
- if (list.indexOf(url) != -1 || list.indexOf(url_end_) != -1) {
- return true;
- }
- }
- return false;
- }
- /**
- * 檢查URL是否被允許搜索
- * @param urlToCheck
- * @return
- */
- private boolean isRobotAllowed(URL urlToCheck) {
- String host = urlToCheck.getHost().toLowerCase();// 獲取給出RUL的主機(jī)
- // System.out.println("主機(jī)="+host);
- // 獲取主機(jī)不允許搜索的URL緩存
- ArrayList<String> disallowList = disallowListCache.get(host);
- // 如果還沒有緩存,下載并緩存。
- if (disallowList == null) {
- disallowList = new ArrayList<String>();
- try {
- URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
- BufferedReader reader = new BufferedReader(
- new InputStreamReader(robotsFileUrl.openStream()));
- // 讀robot文件,創(chuàng)建不允許訪問的路徑列表。
- String line;
- while ((line = reader.readLine()) != null) {
- if (line.indexOf("Disallow:") == 0) {// 是否包含"Disallow:"
- String disallowPath = line.substring("Disallow:"
- .length());// 獲取不允許訪問路徑
- // 檢查是否有注釋。
- int commentIndex = disallowPath.indexOf("#");
- if (commentIndex != -1) {
- disallowPath = disallowPath.substring(0,
- commentIndex);// 去掉注釋
- }
- disallowPath = disallowPath.trim();
- disallowList.add(disallowPath);
- }
- }
- for (Iterator it = disallowList.iterator(); it.hasNext();) {
- System.out.println("Disallow is :" + it.next());
- }
- // 緩存此主機(jī)不允許訪問的路徑。
- disallowListCache.put(host, disallowList);
- } catch (Exception e) {
- return true; // web站點(diǎn)根目錄下沒有robots.txt文件,返回真
- }
- }
- String file = urlToCheck.getFile();
- // System.out.println("文件getFile()="+file);
- for (int i = 0; i < disallowList.size(); i++) {
- String disallow = disallowList.get(i);
- if (file.startsWith(disallow)) {
- return false;
- }
- }
- return true;
- }
- public static void main(String[] args) {
- Spider ph = new Spider("英超", "http://www.microsoft.com");
- try {
- // ph.processHtml();
- Thread search = new Thread(ph);
- search.start();//啟動(dòng)線程
- } catch (Exception ex) {
- }
- }
- }
- --------------------------------------SearchResultBean.java---------------------------------------------------------
/**
 * Value object describing one crawled page: where it was found, its
 * title, the keyword it was searched for, and how often that keyword
 * occurred. Plain mutable JavaBean with getters and setters.
 */
public class SearchResultBean {

    // address of the page this result refers to
    String url = "";
    // content of the page's <title> tag
    String title = "";
    // keyword(s) this result was matched against
    String keywords = "";
    // number of text nodes in which the keyword occurred
    int count_key_words = 0;

    /** @return the page URL */
    public String getUrl() {
        return url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title */
    public String getTitle() {
        return title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the searched keyword(s) */
    public String getKeywords() {
        return keywords;
    }

    /** @param keywords the searched keyword(s) */
    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    /** @return how many times the keyword occurred */
    public int getCount_key_words() {
        return count_key_words;
    }

    /** @param count_key_words how many times the keyword occurred */
    public void setCount_key_words(int count_key_words) {
        this.count_key_words = count_key_words;
    }
}
本站僅提供存儲(chǔ)服務(wù),所有內(nèi)容均由用戶發(fā)布,如發(fā)現(xiàn)有害或侵權(quán)內(nèi)容,請(qǐng)
點(diǎn)擊舉報(bào)。