為了便於使用及日後的擴展,將 ScrapySharp 簡單封裝為 Requester,具體代碼如下:
using System;
using System.Collections.Generic;
using Crawler.Common;

namespace Crawler.Protocol
{
    /// <summary>
    /// Thin wrapper around <see cref="Browser"/> that downloads a single URL,
    /// either as text or as raw bytes.
    /// </summary>
    /// <remarks>
    /// When the URL's host is a DNS name, the constructor tries to resolve it with
    /// <c>DnsResolver</c> and, on success, requests the resolved IP address instead.
    /// NOTE(review): after the rewrite the request targets the IP literal; whether the
    /// browser still sends the original host name (Host header / TLS SNI) is not visible
    /// here - callers that need it can presumably pass a "Host" entry in <c>headers</c>; confirm.
    /// </remarks>
    public class Requester
    {
        /// <summary>Address that will be requested (host already replaced by its IP when resolution succeeded).</summary>
        private Uri Url { get; set; }

        /// <summary>Browser instance used to perform the requests.</summary>
        private Browser Browser { get; set; }

        /// <summary>
        /// Creates a requester for <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <param name="headers">Optional headers copied onto the browser; <c>null</c> means none.</param>
        /// <param name="browser">Browser to use; a new <see cref="Browser"/> is created when <c>null</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c> (thrown by <see cref="Uri"/>).</exception>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI (thrown by <see cref="Uri"/>).</exception>
        public Requester(string url, Dictionary<string, string> headers = null, Browser browser = null)
        {
            var u = new Uri(url);

            // Only DNS host names need resolving. Uri has already classified the host,
            // so IPv4/IPv6 literals (with or without a port) are left untouched.
            if (u.HostNameType == UriHostNameType.Dns)
            {
                // Resolve the bare host: u.Host never includes the ":port" suffix.
                var dns = new DnsResolver(u.Host);
                if (dns.IsSuccess)
                {
                    // Swap only the host component; scheme, port, path and query are preserved
                    // even if the host text happens to occur elsewhere in the URL.
                    u = new UriBuilder(u) { Host = dns.Record.Address.ToString() }.Uri;
                }
            }

            Url = u;
            Browser = browser ?? new Browser();

            if (headers != null)
            {
                foreach (var header in headers)
                {
                    Browser.Headers[header.Key] = header.Value;
                }
            }
        }

        /// <summary>Downloads the target URL and returns the response body as a string.</summary>
        public string GetHtml()
        {
            return Browser.DownloadString(Url);
        }

        /// <summary>Navigates to the target URL and returns the raw response body bytes.</summary>
        public byte[] GetFile()
        {
            return Browser.NavigateToPage(Url).RawResponse.Body;
        }
    }
}
考慮到可能對 ScrapingBrowser 做一些擴展(例如增加對 FTP 等其他協議的支持),故新建了 Browser 類繼承自 ScrapingBrowser 類:
using ScrapySharp.Network;

namespace Crawler.Protocol
{
    /// <summary>
    /// Project-local browser type deriving from <see cref="ScrapingBrowser"/>.
    /// It adds no members of its own; it exists as an extension point so that
    /// crawler-specific behaviour can be added later without touching callers.
    /// </summary>
    public class Browser : ScrapingBrowser
    {
    }
}
原文:http://www.cnblogs.com/JiaoWoWeiZai/p/5866977.html
聯繫客服