from urllib import request
from io import BytesIO
import gzip
import re


class Spider():
    def __init__(self):
        self.url = 'https://www.douyu.com/g_LOL'
        # Regex patterns targeting the markup of Douyu's LOL listing page.
        self.root_pattern = r'<div class="DyListCover-info"><span class="DyListCover-hot is-template"><svg><use xlink:href="#icon-hot_8a57f0b"></use></svg>([\s\S]*?)</h2></div>'
        self.number_pattern = r'([\s\S]*?)</span>'
        self.name_pattern = r'</use></svg>([\s\S]*?)'

    def __fetch_content(self):
        # Send a browser User-Agent so the server does not reject the request.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.122 Safari/537.36'
        }
        page1 = request.Request(self.url, headers=headers)
        r = request.urlopen(page1)
        htmls = r.read()  # raw bytes of the response body
        # The response is gzip-compressed, so decompress it before decoding.
        buff = BytesIO(htmls)
        f = gzip.GzipFile(fileobj=buff)
        htmls = f.read().decode('utf-8')
        return htmls

    def __analysis(self, htmls):
        root_htmls = re.findall(self.root_pattern, htmls)
        anchors = []
        for origin_html in root_htmls:
            # Strip the fixed tag sitting between the hot count and the name.
            new_html = origin_html.replace(
                '</span><h2 class="DyListCover-user is-template">'
                '<svg><use xlink:href="#icon-user_c95acf8"></use></svg>', '')
            anchors.append(new_html)
        print(anchors)

    def go(self):
        htmls = self.__fetch_content()
        self.__analysis(htmls)


spider = Spider()
spider.go()
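Note that number_pattern and name_pattern are defined in __init__ but never used: __analysis only deletes one fixed tag, which leaves the hot count and the streamer name glued together in each string. Below is a minimal sketch of how each raw match could instead be split into structured fields. It assumes each fragment has the layout implied by root_pattern (count, then </span>, then the user <h2>/<svg> tags, then the name); the helper name parse_fragment is illustrative and not part of the original code, and the live Douyu page may have changed since.

import re

def parse_fragment(fragment):
    """Split one raw root_pattern match into hot count and streamer name.
    Assumed fragment shape: '226.9万</span><h2 ...><svg>...</svg>SomeStreamer'."""
    number, _, rest = fragment.partition('</span>')  # text before </span> is the hot count
    _, _, name = rest.rpartition('</svg>')           # text after the last </svg> is the name
    return {'name': name.strip(), 'number': number.strip()}

# Hypothetical usage with the list built in __analysis():
# parse_fragment('226.9万</span><h2 ...><svg>...</svg>SomeStreamer')
# -> {'name': 'SomeStreamer', 'number': '226.9万'}

Returning such dicts from __analysis (instead of printing raw strings) would make it easy to sort anchors by popularity or dump them to a file later.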
I used to think web crawlers were hard, and finishing this small goal lets me relax a little. Deep down, though, it all still feels hazy. This is only the tip of the iceberg.