爬取糗事百科段子

2020-07-17 23:06:59 LanceLee 数据爬虫 827

- N +
# 导入 requests 和 BeautifulSoup
import requests
from bs4 import BeautifulSoup

def download_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so the body is not a usable listing page.
    """
    # Send a desktop-browser User-Agent instead of the library default.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    # NOTE: for large crawls the original author routed requests through a
    # locally running proxy pool (http://localhost:5555/random) by passing a
    # ``proxies`` mapping to requests.get; that path is not enabled here.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of handing them to the parser.
    response.raise_for_status()
    return response.text
def get_content(html,page):
    """Extract every joke on one listing page and append it to the output file.

    Args:
        html: HTML text of a listing page.
        page: Page number; written at the start of each record.

    Returns:
        None. Each ``div.article`` found inside the element with
        ``id="content"`` is formatted and handed to ``save_text``. If the page
        has no such element, nothing is written.
    """
    # Record layout: page, author, gender, age, votes, comments, then the text.
    output = """第{}页 作者：{} 性別：{} 年纪：{} 关注：{} 评价：{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find(id='content')
    if container is None:
        # Without the listing container there are no articles to read;
        # calling find_all on None would raise AttributeError.
        return
    for article in container.find_all('div', class_='article'):
        # Author nickname.
        author = article.find('h2').string
        # Joke text.
        text = article.find('div', class_='content').find('span').get_text()
        # Vote and comment counters both live under the "stats" element.
        stats = article.find('div', class_='stats')
        votes = stats.find('span', class_='stats-vote').find('i').string
        comments = stats.find('span', class_='stats-comments').find('a').find('i').string
        # Gender and age come from the "articleGender" element; when it is
        # absent (anonymous author, per the original notes) both stay blank.
        gender = ''
        age = ''
        author_info = article.find('div', class_='articleGender')
        if author_info is not None:
            css_classes = author_info['class']
            # The gender is encoded as a CSS class on the element.
            if 'womenIcon' in css_classes:
                gender = '女'
            elif 'manIcon' in css_classes:
                gender = '男'
            age = author_info.string
        # Append the formatted record to the output file.
        save_text(output.format(page, author, gender, age, votes, comments, text))
# Helper that appends text records to the output file.
def save_text(*args, path=r"D:\python\qiushibaike.txt"):
    """Append each string in *args*, in order, to the file at *path*.

    Args:
        *args: Text chunks to write; they are written back to back with no
            separator added.
        path: Destination file (keyword-only). Defaults to the location the
            script has always used. The file is opened in append mode with
            UTF-8 encoding.

    Calling with no chunks does nothing and does not create the file.
    """
    if not args:
        return
    # Open once for the whole batch instead of once per chunk.
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(args)

def main(first_page=1, last_page=1):
    """Download listing pages and save the jokes found on each one.

    Args:
        first_page: Number of the first page to fetch.
        last_page: Number of the last page to fetch (inclusive).

    With the defaults only page 1 is fetched, which matches the previously
    hard-coded ``range(1, 2)``.
    """
    for page in range(first_page, last_page + 1):
        # Build the address of this listing page.
        url = "https://qiushibaike.com/text/page/{}".format(page)
        # Download it, then parse and store its records.
        html = download_page(url)
        get_content(html, page)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
文章来源于网络，如有侵权请联系站长QQ61910465删除
本文版权归趣快排营销 www.SEOguruBLOG.com 所有，如有转载请注明出处。竞价开户托管、SEO优化请联系QQ：61910465