博主帮一个朋友做论文,要分析知乎的问答数据,数据量不多,因此简单用selenium中关于鼠标下滑window.scrollTo方法爬取了知乎的“抑郁症”专题相关问答

一、python开头引入的模块


import requests, json, random

# Py2/Py3 compatibility: `cookielib` was renamed to `http.cookiejar` in Py3.
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib
import os.path
# Pillow is optional — only used to display the captcha image.
try:
    from PIL import Image
except ImportError:
    pass
import time, csv, xml, re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # browser launch options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

chrome_options = Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# NOTE(review): the options object is built but not passed to the driver here,
# so Chrome actually runs with a visible window; pass
# `webdriver.Chrome(options=chrome_options)` to make it truly headless.
driver = webdriver.Chrome()



二、知乎python登录验证

# Construct the request headers.
# User-Agent pool (originally loaded from a config table); one is picked at
# random per run to look less like a bot.
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.8.3.16721",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
]
ua = random.choice(ua_list)
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/topic/19564862/hot",
    'User-Agent': str(ua)
}
# fix: the functions below all pass the name `header`, which was never
# defined (NameError at call time); alias it to the dict built above.
header = headers
# Proxy IP pool: fetch a JSON list of {ip, port} records from a proxy
# provider URL and build requests-style proxy dicts from it.
url_ip = ''  # TODO: fill in the provider URL that returns proxies as JSON
proxies_list = []
# fix: the original called requests.get('') unconditionally, which raises
# on the empty placeholder URL; skip the fetch until a URL is configured.
if url_ip:
    resp = requests.get(url=url_ip)
    if resp.status_code == 200:
        data_json = resp.json()
        # presumably the provider wraps the records in an 'obj' key —
        # TODO confirm against the actual API response.
        for d in data_json['obj']:
            full_ip = d['ip'] + ':' + d['port']
            # fix: the original bound this dict to the name `dict`,
            # shadowing the builtin.
            proxies_list.append({'http': full_ip})
# NOTE(review): the original text had a corrupt fragment here (a stray
# `proxies ,` / 'User-Agent' / `}` leftover from the scrape) — dropped.
# Reuse login cookie information persisted on disk between runs.
# fix: `cookie_file` was never defined anywhere in the original —
# TODO confirm the intended path.
cookie_file = 'zhihu_cookie'
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename=cookie_file)
try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):  # no cookie file yet on the first run
    print("Cookie 未能加载")


def get_xsrf():
    """Return the dynamic `_xsrf` anti-CSRF token required by the login POST.

    The token is delivered as a cookie when the Zhihu home page is fetched,
    not embedded in the HTML (the commented-out regex approach was dropped).

    :return: the `_xsrf` cookie value, or None if absent.
    """
    index_url = 'https://www.zhihu.com'
    # fix: the original passed the undefined name `header`.
    index_page = session.get(index_url, headers=headers)
    # NOTE(review): reads a private requests attribute (`request._cookies`);
    # `session.cookies.get("_xsrf")` would be the public equivalent — confirm.
    xsrf = index_page.request._cookies.get("_xsrf")
    return xsrf

# 获取验证码

def get_captcha():
    """Download the login captcha image and ask the user to type it in.

    Saves the image to ``captcha.jpg`` in the working directory, shows it
    with Pillow when available, and falls back to telling the user where to
    find the file otherwise.

    :return: the captcha text entered by the user.
    """
    # Millisecond timestamp acts as a cache-buster on the captcha URL.
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    # fix: the original passed the undefined name `header`.
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        # fix: dropped the explicit f.close() — `with` already closes it.
        f.write(r.content)
    # Display the captcha with Pillow; if Pillow is missing (its import is
    # guarded at the top of the file), tell the user where the file is.
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha\n>")
    return captcha

def isLogin():
    """Return True when the saved session is already logged in.

    Fetching the profile-settings page without following redirects yields
    HTTP 200 for an authenticated session and a redirect status otherwise.
    """
    url = "https://www.zhihu.com/settings/profile"
    # fix: the original passed the undefined name `header`.
    login_code = session.get(url, headers=headers, allow_redirects=False).status_code
    return login_code == 200

def login(secret, account):
    """Log in to Zhihu with a phone number or an email address.

    :param secret: the account password.
    :param account: an 11-digit mainland phone number or an email address.
    :return: 0 when the account string is neither, otherwise None.

    Side effect: saves the session cookies to ``cookie_file`` on success.
    """
    # A mainland phone number is 11 digits starting with 1.
    if re.match(r"^1\d{10}$", account):
        print("手机号登录 \n")
        post_url = 'https://www.zhihu.com/login/phone_num'
        postdata = {
            '_xsrf': get_xsrf(),
            'password': secret,
            'remember_me': 'true',
            'phone_num': account,
        }
    else:
        if "@" in account:
            print("邮箱登录 \n")
        else:
            print("你的账号输入有问题,请重新登录")
            return 0
        post_url = 'https://www.zhihu.com/login/email'
        postdata = {
            '_xsrf': get_xsrf(),
            'password': secret,
            'remember_me': 'true',
            'email': account,
        }
    try:
        # Try to log in directly, without a captcha.
        # fix: this statement was fused with the next one in the scraped
        # source ("headers=login_code = eval(...)"); reconstructed.
        login_page = session.post(post_url, data=postdata, headers=headers)
        # SECURITY fix: the original ran eval() on the server response;
        # parse it as JSON instead of executing server-controlled text.
        login_code = json.loads(login_page.text)
        print(login_code.get('msg'))
    except Exception:
        # NOTE(review): the captcha-fallback branch was lost where the
        # source text is truncated — presumably it re-posted the form with
        # a `captcha` field from get_captcha(); confirm against the
        # original article.
        postdata['captcha'] = get_captcha()
        login_page = session.post(post_url, data=postdata, headers=headers)
        print(json.loads(login_page.text).get('msg'))
    # Persist cookies so later runs can skip the login flow.
    cookie_path = cookie_file
    session.cookies.save(cookie_path)
# try:
#     input = raw_input
# except:
#     pass
## 将主页面的用户提问 print 到 shell

def getpage(url2):
    """Fetch *url2* with the logged-in session and collect question links.

    :param url2: page URL to fetch.
    :return: list of ``<a class="question_link">`` tags found on the page
        (the original computed this list and silently dropped it).
    """
    # fix: the original passed the undefined name `header`.
    mainpage = session.get(url2, headers=headers)
    soup = BeautifulSoup(mainpage.text, 'html.parser')
    tags = soup.find_all("a", class_="question_link")
    return tags

def get_login_cookie(url):
    """Load saved cookies for *url* from ``cookie_file``.

    Logs in first (prompting for credentials) when no cookie file exists yet.

    :param url: site URL whose domain the cookies must match.
    :return: dict mapping cookie name -> value, or None when loading fails.
    """
    if not os.path.exists(cookie_file):
        # fix: the original referenced undefined `account`/`secret` here;
        # restored the input() prompts it had commented out.
        account = input('请输入你的用户名\n>  ')
        secret = input("请输入你的密码\n>  ")
        user_name = account
        passwd = secret
        login(passwd, user_name)
    try:
        cookie_jar = cookielib.LWPCookieJar(cookie_file)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        print('Load cookie succeeded')
    except cookielib.LoadError:
        return None
    else:
        # Keep only cookies whose domain appears inside the target URL.
        cookie_d = {}
        for cookie in cookie_jar:
            domain = cookie.domain
            if url.find(domain) > 0:
                cookie_d[cookie.name] = cookie.value
        return cookie_d

if __name__ == '__main__':
    if isLogin():
        print('您已经登录')
        url2 = 'https://www.zhihu.com'
        getpage(url2)
    else:
        # fix: `account` and `secret` were undefined here — the original
        # had the input() prompts commented out; restored them.
        account = input('请输入你的用户名\n>  ')
        secret = input("请输入你的密码\n>  ")
        login(secret, account)

三、window.scrollTo方法获取页面数据

url = 'https://www.zhihu.com/topic/19564862/questions'
driver.get(url)
time.sleep(3)

# Scroll the topic feed until enough items have rendered.
prev_count = -1
while True:
    res = driver.page_source
    html = BeautifulSoup(res, 'html.parser')
    items = html.find_all('div', class_='List-item TopicFeedItem')
    # Jump far past the bottom to trigger the next lazy-load batch.
    driver.execute_script('window.scrollTo(0,10000000)')
    time.sleep(1)  # give the lazily loaded content time to arrive
    print(len(items))
    # Target row count — adjust to however much data is needed.
    if len(items) >= 2000:
        break
    # Robustness fix: the original spun forever when the feed ran out
    # before 2000 items; stop once a scroll adds nothing new.
    if len(items) == prev_count:
        break
    prev_count = len(items)


四、python采集数据(数据采集部分,博主太懒了,没有打包函数,因为采集少量数据,没有scrapy框架或者用gevent和queue多线程和协程爬虫去做,如果有需要后续可以自己去打包)

# Final page: parse the fully scrolled page source from the selenium driver
# and visit each question's detail page.
res = driver.page_source
html = BeautifulSoup(res, 'html.parser')
items = html.find_all('div', class_='List-item TopicFeedItem')
for item in items:
    # NOTE(review): this line was corrupted by embedded HTML markup in the
    # scraped source; the surviving note says "get the question id via
    # regex" — reconstructed as the <h2> title's link; confirm the selector.
    link = item.find('h2').find('a')
    if link is None:
        continue
    url1 = 'https://www.zhihu.com' + link.get('href', '')
    # Question id = the digits in the detail URL (used as the key into the
    # page's initialData JSON below; the original's `author_id` was
    # undefined).
    question_id = re.sub(r'\D', '', url1)

    driver.get(url1)
    res = driver.page_source
    html = BeautifulSoup(res, 'html.parser')
    # Detail-page scraping; questions without a description are skipped.
    if html.find('div', class_='QuestionHeader-detail') is not None:
        if html.find('button', class_='QuestionRichText-more') is not None:
            # Click "read more" to expand the full description first.
            driver.find_element_by_class_name('QuestionRichText-more').click()
            time.sleep(1)
        # NOTE(review): the original only extracted fields when the
        # "read more" button existed; extracting unconditionally looks
        # intended — confirm.
        title = driver.find_element_by_class_name('QuestionHeader').find_element_by_tag_name('h1').text
        des = driver.find_element_by_class_name('QuestionHeader-detail').text
        comment = html.find('button', class_='Button Button--plain Button--withIcon Button--withLabel').text.replace('\u200b', '评论数量:')
        # The page embeds its data as JSON inside a <script> tag.
        script = html.find('script', id='js-initialData').text
        # SECURITY fix: the original ran eval() on this page-controlled
        # text (twice); parse it once with the JSON parser instead.
        data = json.loads(script)
        question = data['initialState']['entities']['questions'][question_id]
        author = question['author']
        timeStamp = question['updatedTime']
        # NOTE(review): the scraped source is truncated here mid-statement
        # ("timeArray = time<span ..."); presumably it converted the epoch
        # timestamp and wrote a CSV row — reconstruct the rest from the
        # original article.
        timeArray = time.localtime(timeStamp)
本文版权归趣营销www.SEOgUrublog.com 所有,如有转发请注明来出,竞价开户托管,seo优化请联系QQ卍61910465