Recently I tried to download a novel; the code is as follows:
import requests
from bs4 import BeautifulSoup
import time

# list to hold every chapter's text
data_dict = []

# request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# page counter
page_number = 1

# loop while page_number is less than 10
while page_number < 10:
    # build the URL of the chapter-list page
    url = 'https://www.bbiquge.net/book/88109/index_{}.html'.format(page_number)
    # request the page
    res = requests.get(url, headers=headers, verify=False)
    #print(res.status_code)
    # pause 6 s
    time.sleep(6)
    # parse the page
    bs = BeautifulSoup(res.text, 'html.parser')
    # find the child nodes that contain the chapter links
    bookname_list = bs.find_all('dd')
    # loop over the search results
    for bookname in bookname_list:
        # extract the chapter title
        title = bookname.text
        # extract the chapter link
        book_href = bookname.find('a')['href']
        #print(book_href)
        # join to form the chapter detail-page URL
        url_1 = 'https://www.bbiquge.net/book/88109/' + str(book_href)
        # request the chapter detail page
        res_1 = requests.get(url_1, headers=headers, verify=False)
        # pause 6 s
        time.sleep(6)
        #print(res_1.status_code)
        # parse the chapter detail page
        bs_1 = BeautifulSoup(res_1.text, 'html.parser')
        # extract the chapter body text
        content = bs_1.find('div', id='content').text
        data_dict.append(content)
        # write only this chapter's content into its own txt file
        # (writing all of data_dict here would duplicate earlier chapters in every file)
        with open(title + '.txt', 'w', encoding='utf-8') as f:
            f.write(content.replace('\n', ''))
        print(str(title) + ' downloaded successfully!')
    # advance to the next page
    page_number += 1
Following the request-the-page, parse-the-page, store-the-data steps, the data comes back fine, but because the requests are too frequent, the server eventually drops the connection. Possible fixes:
(1) Disable persistent connections in the headers:
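A minimal sketch: adding 'Connection': 'close' to the request headers asks the server to close the connection after each response instead of keeping it alive, so every request gets a fresh connection:

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Connection': 'close',  # do not keep the TCP connection alive between requests
}
res = requests.get(url, headers=headers, verify=False)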
(2) Throttle the request rate; the pause between requests can be longer:
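For example, instead of the fixed time.sleep(6) in the script above, a longer, randomized pause looks less like a bot (the 6-12 s range here is an arbitrary choice):

import random
import time

# sleep a random 6-12 seconds between requests
time.sleep(random.uniform(6, 12))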
(3) Rotate the User-Agent at random:
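A sketch of the idea: keep a small pool of User-Agent strings (the ones below are just example values) and pick one at random for each request:

import random

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]

# pick a random User-Agent for this request
headers = {'user-agent': random.choice(user_agents)}
res = requests.get(url, headers=headers, verify=False)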
(4) Change the retry count: requests.adapters.DEFAULT_RETRIES = 5
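Note that DEFAULT_RETRIES is read when HTTPAdapter is defined, so reassigning it after import may not take effect; mounting an adapter with max_retries set explicitly is the more reliable way to get retries:

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(max_retries=5)  # retry failed connections up to 5 times
session.mount('https://', adapter)
session.mount('http://', adapter)
res = session.get(url, headers=headers, verify=False)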
(5) Disable SSL verification with verify=False:
response = requests.get(fpath_or_url, headers=headers, stream=True, verify=False)
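With verify=False, urllib3 emits an InsecureRequestWarning on every request; it can be silenced as below (only do this for sites you trust, since certificate checking is off):

import urllib3

# suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)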