# 以下是原创代码。虽然我并没有系统地学习过爬虫,但在不懈的努力(上网查资料、翻书等)下,
# 还是搞定了携程网机票价格的爬取。回顾这段历程,往日的心酸真的历历在目。
# 说真的,直到现在我也不完全明白其中一些语法的应用法则,但这是一份呕心沥血改编而来的代码;
# 具体的参考来源由于年代久远、出处零散,已无从考证,甚是遗憾。
# 最后,公布代码会犯法吗?这样会侵犯携程网站的利益吗?为了安全起见,我还是保留了一部分
# (没有全部公开),不过这份代码可以给应对某 sir 课程的小伙伴提供一个参考。
import pymongo
import time
from selenium import webdriver
# The two module paths below were truncated to "mon.…" in the pasted source;
# Selenium ships TimeoutException in selenium.common.exceptions and By in
# selenium.webdriver.common.by.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
from urllib.parse import quote

# MongoDB settings. Only MONGO_URL and MONGO_DB are read below; nothing in the
# visible code writes to the database.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
# Not referenced by the visible code (webdriver.Firefox() is created without
# arguments). The leading "--" had been turned into an en dash by the paste.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Module-level side effects: running this file launches a Firefox window,
# creates a MongoDB client object and opens the output file.
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Output file shared by index_page() and get_products(). Mode "w" already
# empties the file, so no separate truncate() call is needed. The encoding
# name previously carried a stray trailing comma ("utf-8,").
f = open('携程机票价格.txt', 'w', encoding='utf-8')

# Today's date as YYYY-MM-DD: used as the first line of the output file and
# as the ``date`` query parameter of the search URL.
today_timestamp = time.time()
today = time.strftime('%Y-%m-%d', time.localtime(today_timestamp))
print(today)
f.write(today)
f.write('\n')
def index_page(page):
    """Open today's one-way flight listing and record its title and prices.

    Loads the search URL in the shared ``browser``, writes the first
    ``-``-separated segment of the page title and a header line to the shared
    output file ``f``, then hands over to ``get_products()``.

    :param page: page number; only used in the progress message.
    :raises SystemExit: when a ``TimeoutException`` is raised while loading
        or processing the page (the file and browser are closed first).
    """
    print('正在爬取第', page, '页')
    try:
        # NOTE(review): this literal has no scheme or host, so browser.get()
        # is given a relative path. The host appears to have been dropped
        # from the pasted source -- restore the full site URL before running.
        url = '/itinerary/oneway/cgo-bjs?date=' + today
        print(url)
        browser.get(url)
        print(browser.title)
        title = browser.title.split('-')[0]
        print(title)
        f.write(title)
        f.write('\n')
        f.write('Price')
        f.write('Date')
        f.write('\n')
        print('OK')
        get_products()
    except TimeoutException:
        print('NO!!!')
        # Release the output file and the browser window before stopping,
        # the same way get_products() does on its own exit path.
        f.close()
        browser.close()
        # exit() is a helper injected by the ``site`` module for interactive
        # use; raising SystemExit is the equivalent that is always available.
        raise SystemExit
def get_products():
    """Extract price/date pairs from the loaded page and append them to ``f``.

    Parses ``browser.page_source`` with pyquery and, for every element matched
    by ``#app .slick-list .li``, writes one line ``¥<price><date>`` to the
    shared output file, with the price left-justified to 13 characters.

    :raises SystemExit: at the first item whose price text is empty and whose
        date does not match today's day of month; the file and browser are
        closed before exiting.
    """
    html = browser.page_source
    doc = pq(html)
    # NOTE(review): ".li" selects elements with class "li"; if the calendar
    # entries are <li> tags this was presumably meant to be "li" -- confirm
    # against the live page markup.
    items = doc('#app .slick-list .li').items()
    print('从郑州到北京的飞机票')

    # Day-of-month part of the YYYY-MM-DD string; constant for the whole loop.
    today_day = today[8:10]

    for item in items:
        product = {
            'price': item.find('.base_price02').text(),
            'date': item.find('.calendar_date').text(),
        }
        # Drop the first character of the price text (presumably the currency
        # sign, which is written separately below) and pad to a fixed width.
        price = product['price'][1:].ljust(13)
        # Assumes characters 3-4 of the calendar text hold the day of month
        # -- TODO confirm against the page.
        date = product['date'][3:5]

        # An unpriced entry for a day other than today ends the run.
        if product['price'] == '' and date != today_day:
            f.close()
            browser.close()
            # exit() comes from the ``site`` module and is meant for the
            # interactive shell; raise SystemExit directly instead.
            raise SystemExit

        print(product)
        f.write('¥')
        f.write(price)
        f.write(product['date'])
        f.write('\n')
def main():
    """Run the crawl and close the browser window.

    Only page 1 is requested; no further pages are iterated.
    """
    index_page(1)
    browser.close()
if __name__ == '__main__':
    # The output file ``f`` is already opened at module level (with an
    # explicit UTF-8 encoding) and already holds the date line. Opening it
    # again here in "w" mode would rebind ``f`` to a second handle on the
    # same path without the explicit encoding and truncate the file again,
    # so the existing handle is reused instead.
    try:
        main()
    finally:
        # Runs on the SystemExit paths too; closing an already-closed file
        # object is a no-op.
        f.close()