今天上传一个关于公交线路爬虫的代码。
话不多说,直接讲思路:对网站进行分析,分析URL ——> 定位“切换城市”并点击 ——> 根据输入的字符串定位城市并点击,定位不到则返回错误信息 ——> 获取对应城市的公交线路的各个分类 ——> 对各个分类进行点击,获取分类下每一条线路的链接并存入字典中 ——> 循环访问每一个链接,并把匹配到的内容写入表格中。
爬取的网站是:www.8684.cn
下面是代码:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Crawl bus-route data for one city from www.8684.cn and save it to an .xls file."""

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xlwt


def search_bus(city):
    """Open the site, switch to *city*, and return {category name: category URL}.

    city -- the visible link text of the city to switch to.
    Raises a selenium TimeoutException when the city link or the category
    list cannot be located within the wait timeout.
    """
    url = 'http://beijing.8684.cn/line1'
    browser = webdriver.Chrome()
    try:
        browser.implicitly_wait(10)
        browser.get(url)
        browser.maximize_window()
        # Wait for the "switch city" control BEFORE clicking it; the wait
        # itself returns the located element.
        switch = WebDriverWait(browser, 15, 0.3).until(
            EC.presence_of_element_located(
                (By.XPATH, '//span[@class="city_switch"]')))
        ActionChains(browser).click(switch).perform()
        # Wait until the requested city link is visible, then click it.
        city_link = WebDriverWait(browser, 15, 0.4).until(
            EC.visibility_of_element_located((By.LINK_TEXT, city)))
        ActionChains(browser).click(city_link).perform()
        # Collect every route-category link on the city's page.
        links = WebDriverWait(browser, 15, 0.3).until(
            EC.presence_of_all_elements_located(
                (By.XPATH,
                 '//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')))
        return {a.text: a.get_attribute('href') for a in links}
    finally:
        # Always release the browser, even when a wait times out.
        browser.quit()


def go_url(dicts):
    """Visit every category URL and return {category: {route name: route URL}}.

    dicts -- mapping produced by search_bus().
    """
    result = {}
    # Reuse ONE browser for all categories instead of launching a fresh
    # Chrome per iteration.
    browser = webdriver.Chrome()
    try:
        for category, url in dicts.items():
            browser.get(url)
            links = WebDriverWait(browser, 15, 0.3).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//div[@id="con_site_1"][@class="stie_list"]//a')))
            result[category] = {a.text: a.get_attribute('href') for a in links}
    finally:
        browser.quit()
    return result


def write_mei_ge(dict_1, city):
    """Visit every route URL and write the stop names into '<city>.xls'.

    dict_1 -- nested mapping {category: {route name: route URL}} from go_url().
    city   -- city name; used as the output file name.
    Each category becomes one worksheet; each route is one row: the route
    name in column 0, then one stop name per column.
    """
    work = xlwt.Workbook()
    # One browser reused for every route page; previously a new Chrome was
    # spawned per route and leaked whenever an exception fired.
    browser = webdriver.Chrome()
    try:
        for category, routes in dict_1.items():
            table = work.add_sheet(category)
            row = 0
            for route_name, route_url in routes.items():
                try:
                    browser.get(route_url)
                    stops = WebDriverWait(browser, 15, 0.3).until(
                        EC.presence_of_all_elements_located(
                            (By.XPATH, '//div[@class="bus_line_site "][1]//a')))
                    table.write(row, 0, route_name)
                    for col, stop in enumerate(stops, start=1):
                        table.write(row, col, stop.text)
                    row += 1
                except Exception:
                    # Best-effort: skip routes whose page fails to load or
                    # parse, but do not swallow KeyboardInterrupt/SystemExit
                    # as the old bare `except:` did.
                    print('%s is false' % route_name)
                    continue
    finally:
        browser.quit()
    return work.save('%s.xls' % city)
分段:
1.导入模块,这个就不用讲了,都懂。。。
#!/usr/bin/python3 # -*- coding: utf-8 -*- from selenium import webdriver from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.by import By from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import xlwt |
2. 创建函数,形参为city,传入实参-----城市名。最后返回一个字典,字典值是每个分类对应的链接,键是分类名。
def search_bus(city):
    """Open the site, switch to *city*, and return {category name: category URL}.

    city -- the visible link text of the city to switch to.
    Raises a selenium TimeoutException when the city link or the category
    list cannot be located within the wait timeout.
    """
    url = 'http://beijing.8684.cn/line1'
    browser = webdriver.Chrome()
    try:
        browser.implicitly_wait(10)
        browser.get(url)
        browser.maximize_window()
        # Wait BEFORE locating: the original code found the element first and
        # waited afterwards, which defeats the explicit wait entirely.
        switch = WebDriverWait(browser, 15, 0.3).until(
            EC.presence_of_element_located(
                (By.XPATH, '//span[@class="city_switch"]')))
        ActionChains(browser).click(switch).perform()
        # Wait until the requested city link is visible, then click it.
        city_link = WebDriverWait(browser, 15, 0.4).until(
            EC.visibility_of_element_located((By.LINK_TEXT, city)))
        ActionChains(browser).click(city_link).perform()
        # Collect every route-category link on the city's page.
        links = WebDriverWait(browser, 15, 0.3).until(
            EC.presence_of_all_elements_located(
                (By.XPATH,
                 '//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')))
        return {a.text: a.get_attribute('href') for a in links}
    finally:
        # Always release the browser, even when a wait times out.
        browser.quit()
3.对每个键对应的链接进行访问,获得每个线路对应的链接,并存入字典中。此处字典为一个字典嵌套一个字典。举个例子:dict = {'分类名1': {'1路': 'http://xxxxxxx.cn/wwwww'}, '分类名2': {'20路': 'http://ssss.cn........', '35路': 'http://ww.ddff.cn/ddffffff'}}
最后返回一个字典。
def go_url(dicts):
    """Visit every category URL and return {category: {route name: route URL}}.

    dicts -- mapping produced by search_bus().
    """
    result = {}
    # Reuse ONE browser for all categories; the original launched a fresh
    # Chrome instance on every loop iteration.
    browser = webdriver.Chrome()
    try:
        for category, url in dicts.items():
            browser.get(url)
            # Wait for the route links to be present, then harvest them;
            # the wait itself returns the element list.
            links = WebDriverWait(browser, 15, 0.3).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//div[@id="con_site_1"][@class="stie_list"]//a')))
            result[category] = {a.text: a.get_attribute('href') for a in links}
    finally:
        # Release the browser even if a page fails to load.
        browser.quit()
    return result
4.最后对之前存储的每个链接进行访问,获取每条公交线路,并写入Xls文件中。
# Param 1 is the dict returned by the previous function; param 2 is the name
# of the city being queried.
def write_1(dict_1, city):
    """Visit every route URL and write the stop names into '<city>.xls'.

    dict_1 -- nested mapping {category: {route name: route URL}} from go_url().
    city   -- city name; used as the output file name.
    Each category becomes one worksheet; each route is one row: the route
    name in column 0, then one stop name per column.
    """
    work = xlwt.Workbook()
    # One browser reused for every route page; previously a new Chrome was
    # spawned per route and leaked whenever an exception fired.
    browser = webdriver.Chrome()
    try:
        for category, routes in dict_1.items():
            table = work.add_sheet(category)
            row = 0
            for route_name, route_url in routes.items():
                try:
                    browser.get(route_url)
                    stops = WebDriverWait(browser, 15, 0.3).until(
                        EC.presence_of_all_elements_located(
                            (By.XPATH, '//div[@class="bus_line_site "][1]//a')))
                    table.write(row, 0, route_name)
                    for col, stop in enumerate(stops, start=1):
                        table.write(row, col, stop.text)
                    row += 1
                except Exception:
                    # Best-effort: skip routes whose page fails, but do not
                    # swallow KeyboardInterrupt/SystemExit like the old
                    # bare `except:` did.
                    print('%s is false' % route_name)
                    continue
    finally:
        browser.quit()
    return work.save('%s.xls' % city)
到此所有代码就上传完成了,如果有不解的地方或需要改进的地方,请您提出来,很高兴与你一起交流。
谢谢阅读。
上文内容不用于商业目的,如涉及知识产权问题,请权利人联系博为峰小编(021-64471599-8017),我们将立即处理。