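The script below automates visits to every post on a CSDN blog (here https://blog.csdn.net/u011503666). It first probes the blog's /article/list/ pages to count how many there are, then scrapes the article links from each list page with urllib and BeautifulSoup, and finally opens every link in a Selenium-driven Chrome so each post is actually rendered. Running it needs the beautifulsoup4 and selenium packages, plus a local Chrome (and, on older Selenium releases, a matching chromedriver on PATH).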
import time
from urllib import request
from urllib.request import urlopen

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def visit_article(articles):
    # Drive a real Chrome session through every article URL so each
    # page is fully rendered (uncomment --headless to run on a server).
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    # driver = webdriver.Firefox()
    time.sleep(2)
    # driver.get('https://blog.csdn.net/u011503666/article/details/111756868')
    for article in articles:
        driver.get(article)
        time.sleep(2)
        driver.refresh()
        time.sleep(5)
    # Close the browser only after every article has been visited.
    driver.quit()


def get_page_nums(page_url):
    # Probe /article/list/1, /article/list/2, ... until a page comes back
    # without an article-list div; the previous page was the last one.
    page_num = 0
    while True:
        page_num += 1
        req = request.Request(f'{page_url}/article/list/{page_num}')
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) ' \
                     'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     'Chrome/45.0.2454.101 Safari/537.36'
        req.add_header('User-Agent', user_agent)
        html = urlopen(req)
        bs_obj = BeautifulSoup(html.read(), 'html.parser')
        article_div = bs_obj.find('div', {'class': 'article-list'})
        if not article_div:
            return page_num - 1


def get_page_article_urls(page_url):
    # Collect every href inside the article-list div of one list page.
    req = request.Request(page_url)
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) ' \
                 'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/45.0.2454.101 Safari/537.36'
    req.add_header('User-Agent', user_agent)
    html = urlopen(req)
    bs_obj = BeautifulSoup(html.read(), 'html.parser')
    articles = list()
    article_div = bs_obj.find('div', {'class': 'article-list'})
    for articlelist in article_div.findAll('a'):
        if 'href' in articlelist.attrs:
            articles.append(articlelist.attrs['href'])
    return articles


def main():
    page_num = get_page_nums('https://blog.csdn.net/u011503666')
    print(f'page_num: {page_num}')
    for x in range(1, page_num + 1):
        articles = get_page_article_urls(
            f'https://blog.csdn.net/u011503666/article/list/{x}')
        visit_article(articles)


if __name__ == '__main__':
    main()
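One thing to be aware of: each post card inside the article-list div can carry more than one <a> tag, so the href list returned by get_page_article_urls may contain the same post several times as well as non-post links. Below is a minimal sketch of a deduplicating variant, not part of the original script; it keeps only links matching the /article/details/ pattern visible in the commented-out URL above, and the name get_unique_article_urls is made up for this example.

# Sketch only: a deduplicating variant of get_page_article_urls.
# Assumes the same CSDN list-page markup used above; the function
# name get_unique_article_urls is hypothetical.
from urllib import request
from urllib.request import urlopen

from bs4 import BeautifulSoup

USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/45.0.2454.101 Safari/537.36')


def get_unique_article_urls(page_url):
    req = request.Request(page_url)
    req.add_header('User-Agent', USER_AGENT)
    bs_obj = BeautifulSoup(urlopen(req).read(), 'html.parser')
    article_div = bs_obj.find('div', {'class': 'article-list'})
    seen = set()
    articles = []
    for a in (article_div.findAll('a') if article_div else []):
        href = a.attrs.get('href', '')
        # Keep only real post URLs, each one once, in first-seen order.
        if '/article/details/' in href and href not in seen:
            seen.add(href)
            articles.append(href)
    return articles

Dropping the duplicates matters here mainly because visit_article spends roughly seven seconds of sleep per URL, so visiting each post exactly once can cut the total run time substantially.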