
Web Crawler -- 25. [selenium in practice] Implementing a Lagou spider -- fetching data with selenium



Code implementation

#encoding: utf-8
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    driver_path = r"D:\Program Files\chromedriver_win32\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Lagou python job-list page
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # wait until the pager has rendered before parsing the list page
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                # a greyed-out "next" button means we are on the last page
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except:
                print(source)
            time.sleep(1)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # self.driver.get(url)
        # open the detail page in a new tab and switch focus to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # close the current detail page
        self.driver.close()
        # switch back to the job list page
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
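Note that the code above is written against Selenium 3: both executable_path= and the find_element_by_xpath helper were removed in Selenium 4. If you are running a current Selenium, a minimal adaptation sketch looks like this (only the driver setup and the element lookup change; the chromedriver path is the same example path as above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver_path = r"D:\Program Files\chromedriver_win32\chromedriver.exe"

# Selenium 4: the driver path is passed through a Service object, not executable_path=
driver = webdriver.Chrome(service=Service(driver_path))

# Selenium 4: find_element_by_xpath() is gone; use find_element(By.XPATH, ...) instead
next_btn = driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")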
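The spider only accumulates the scraped positions in self.positions and prints each one; nothing is persisted. As a small follow-up sketch, here is one way to dump the list to a CSV file with the standard library (the save_positions helper and the output filename are illustrative, not part of the original code):

import csv

def save_positions(positions, filename="lagou_positions.csv"):
    # one row per position dict; the keys match those built in parse_detail_page
    fieldnames = ["name", "company_name", "salary", "city", "work_years", "education", "desc"]
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(positions)

# Usage:
# spider = LagouSpider()
# spider.run()
# save_positions(spider.positions)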
