Selenium爬取煎蛋网

发表于：2019-6-24 21:09

字体：大中小 | 上一篇 | 下一篇 | 我要投稿

作者：cmap 来源：博客园

Selenium

自动化测试工具

　　今天给大家分享如何用Selenium爬取煎蛋网。

　　直接上代码

import os
import urllib.request

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ES
from selenium.webdriver.support.ui import WebDriverWait

　　t = 0

class Custer(object):
    """Crawl http://jandan.net/ooxx with Selenium and save its pictures locally.

    ``run`` walks the comment pages by repeatedly clicking the
    "Older Comments" link; ``xqy`` extracts the pictures from one parsed page
    and downloads them into ``image_dir``.
    """

    # Location of the chromedriver binary handed to Selenium.
    # NOTE(review): ``executable_path`` is kept for compatibility with the
    # Selenium release this script was written for; newer Selenium releases
    # expect a ``Service`` object instead -- confirm against the installed
    # version.
    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

    # Directory (relative to the working directory) that receives the files.
    image_dir = 'images'

    _NEXT_PAGE_XPATH = "//div[@class='cp-pagenavi']/a[last()]"
    _IMAGE_XPATH = "//div[@class='text']//img"

    # Characters that are not allowed in Windows file names, mapped to "_".
    _INVALID_FILENAME_CHARS = str.maketrans('\\/:*?"<>|', '_' * 9)

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "http://jandan.net/ooxx"
        # File names already handed out during this crawl, so that a later
        # picture never overwrites an earlier one.
        self._used_names = set()

    def run(self):
        """Scrape every page, following "Older Comments" until it disappears.

        The browser is always closed when this method returns or raises, so
        an instance can only be run once.

        Raises:
            selenium.common.exceptions.TimeoutException: if a page shows no
                picture or no pagination link within 10 seconds.
        """
        try:
            self.driver.get(self.url)
            while True:
                wait = WebDriverWait(self.driver, 10)
                # Wait for the pictures *before* snapshotting the page;
                # waiting after the HTML has been parsed cannot change it.
                wait.until(
                    ES.presence_of_element_located((By.XPATH, self._IMAGE_XPATH))
                )
                self.xqy(etree.HTML(self.driver.page_source))

                wait.until(
                    ES.presence_of_element_located(
                        (By.XPATH, self._NEXT_PAGE_XPATH)
                    )
                )
                try:
                    btn = self.driver.find_element(By.XPATH, self._NEXT_PAGE_XPATH)
                    # get_attribute returns None when the attribute is absent.
                    title = btn.get_attribute("title") or ""
                    if "Older Comments" not in title:
                        break
                    btn.click()
                    # The old link going stale signals that the next page
                    # has replaced the current one.
                    wait.until(ES.staleness_of(btn))
                except WebDriverException as exc:
                    # Stop instead of looping forever on a persistent error.
                    print("出现异常", exc)
                    break
        finally:
            self.driver.quit()

    def xqy(self, html):
        """Download every picture found in one parsed comment page.

        Args:
            html: parsed page (``run`` passes the result of ``etree.HTML``);
                only its ``xpath`` method is used.

        Each picture is saved as ``<author><extension>`` inside
        ``image_dir``; a numeric suffix is appended when that name has
        already been used. A download that fails is reported and skipped.
        """
        os.makedirs(self.image_dir, exist_ok=True)

        # Author and pictures are read from the same row so they cannot get
        # out of step when a post has no picture or several pictures.
        # NOTE(review): assumes each div.row contains both the div.author and
        # the div.text of one post -- confirm against the live page markup.
        for row in html.xpath("//div[@class='row']"):
            names = row.xpath(".//div[@class='author']/strong/text()")
            author = names[0] if names else ''
            for src in row.xpath(".//div[@class='text']//img/@src"):
                url = self._normalize_url(src)
                target = os.path.join(
                    self.image_dir, self._unique_filename(author, url)
                )
                try:
                    urllib.request.urlretrieve(url, target)
                except (OSError, ValueError) as exc:
                    # URLError/HTTPError derive from OSError; ValueError is
                    # raised for URLs without a usable scheme.
                    print("下载失败", url, exc)

    @staticmethod
    def _normalize_url(src):
        """Return *src* with ``http:`` prepended when it is protocol-relative."""
        src = src.strip()
        if src.startswith('//'):
            return 'http:' + src
        return src

    def _unique_filename(self, author, url):
        """Build a file name from *author* and the extension found in *url*.

        The extension is taken from the last path segment of *url* with any
        query string or fragment removed. Characters that are invalid in
        Windows file names are replaced by ``_``; an empty author becomes
        ``anonymous``. The returned name is recorded so it is not reused.
        """
        path = url.partition('?')[0].partition('#')[0]
        ext = os.path.splitext(path.rsplit('/', 1)[-1])[1]
        stem = author.strip().translate(self._INVALID_FILENAME_CHARS) or 'anonymous'

        candidate = stem + ext
        suffix = 0
        while candidate in self._used_names:
            suffix += 1
            candidate = stem + str(suffix) + ext
        self._used_names.add(candidate)
        return candidate

def main():
    """Create the crawler and run it to completion."""
    Custer().run()


# Start crawling only when this file is executed as a script.
if __name__ == "__main__":
    main()

　　爬取的图片

　　进阶

　　个人尝试用了多线程，但不确定是否真正实现了多线程爬取，只是感觉爬取速度快多了。