
Douban Top 250 Movie Info and Poster Scraper

Date: 2020-01-22 06:55:26


A while ago our teacher assigned a small exercise: scrape the Douban Top 250 movie list, save each movie's information as a txt file, download its poster image, and create one folder per movie.

The code breaks down into the following steps.

1. Create the folders (I first created a base folder on drive D)

print("开始创建文件夹……")for r in range(250):v_foleder="D://豆瓣top250"Vname_list=os.listdir(v_foleder)# 指定路径创建新文件夹file_path='D://豆瓣top250'+"//"+str(r+1)r=r+1if not os.path.exists(file_path): # 判断文件夹是否已经存在os.mkdir(file_path)else:print(file_path + ' 目录已存在')print("创建文件夹完成")

2. Access the website

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return 'request failed'
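As a quick sanity check (just a sketch; it assumes the public list URL https://movie.douban.com/top250 and the headers dict shown further down):

# Hypothetical smoke test: fetch the first page and confirm text came back.
html = getHTMLText('https://movie.douban.com/top250?start=0&filter=')
print(len(html))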

3. Scrape the text information and save it to the corresponding folder

def getInfo(url):
    html = requests.get(url, headers=headers, timeout=30)
    root = etree.HTML(html.content)
    # Each movie sits in a <div class="info"> block
    for InformationBlock in root.xpath('//div[@class="info"]'):
        # Movie title
        title = InformationBlock.xpath('div[@class="hd"]/a/span[@class="title"]/text()')[0]
        # In the <div class="bd"> block, director, cast, release date,
        # country and genre all share one <p>
        info = InformationBlock.xpath('div[@class="bd"]/p[1]/text()')
        # Director and leading actors
        try:
            directorandStar = info[0].replace(" ", "").replace("\n", "").replace(".", "").split("/")[0]
        except:
            directorandStar = " "
        # Release date
        date = "Release date:" + info[1].replace(" ", "").replace("\n", "").split("/")[0]
        # Country of production
        country = "Country:" + info[1].replace(" ", "").replace("\n", "").split("/")[1]
        # Genre
        geners = "Genre:" + info[1].replace(" ", "").replace("\n", "").split("/")[2]
        # Rating
        rate = "Rating:" + InformationBlock.xpath('div[@class="bd"]/div/span[2]/text()')[0]
        # Number of reviewers
        comCount = "Reviewers:" + InformationBlock.xpath('div[@class="bd"]/div/span[4]/text()')[0]
        # One-line quote
        try:
            quote = "Quote:" + InformationBlock.xpath('div[@class="bd"]/p[2]/span[1]/text()')[0]
        except:
            quote = " "
        global Top
        # Show progress in the shell
        print(Top, title, directorandStar, rate, date, country, geners, comCount, quote)
        print("---------------------------------------------------------")
        datas = "{},{},{},{},{},{},{},{},{}".format(Top, title, directorandStar, rate, date, country, geners, comCount, quote)
        base = 'D://豆瓣top250' + "//" + str(Top)
        path = base + "//" + title + ".txt"
        with open(path, "w", newline="", encoding="utf_8_sig") as f:
            f.write(datas)
        Top += 1

4. Download the poster images and save them to the corresponding folders

if __name__ == '__main__':
    i = 0
    urls = ['https://movie.douban.com/top250?start=' + str(n) + '&filter=' for n in range(0, 250, 25)]
    for url in urls:
        r = getHTMLText(url)
        soup = BeautifulSoup(r, 'html.parser')
        titles = soup.select('div.hd a')
        rates = soup.select('span.rating_num')
        pics = soup.select('img[width="100"]')
        for title, rate, pic in zip(titles, rates, pics):
            data = {
                'title': list(title.stripped_strings),
                'rate': rate.get_text(),
                'pic': pic.get('src'),
            }
            i += 1
            middlename = str(i) + "//"
            fileName = data['title'][0] + '.jpg'
            pic1 = requests.get(data['pic'])
            with open("D://豆瓣top250//" + middlename + fileName, 'wb') as photo:
                photo.write(pic1.content)
            # Show progress in the shell
            print("Image " + str(i) + " downloaded")
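As a design note, requests can also stream an image body to disk in chunks instead of holding it all in memory via .content. This is only a sketch of an alternative download helper, not part of the original script:

import requests

# Sketch: stream the poster to disk in 8 KB chunks rather than one .content read.
def download_poster(url, path, headers=None):
    with requests.get(url, headers=headers, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)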

The Douban site sometimes blocks the scraper, so you need to add request headers or use an IP proxy. I used headers here, which is enough in most cases.

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
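If headers alone stop working, requests also accepts a proxies dict. A minimal sketch; the proxy address is a hypothetical placeholder, not a real server:

import requests

# Sketch: route the request through an HTTP proxy (placeholder address).
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
r = requests.get('https://movie.douban.com/top250', headers=headers, proxies=proxies, timeout=30)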

The complete code is as follows.

from lxml import etree
import requests
import os
from bs4 import BeautifulSoup

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

Top = 1

# Create the folders first
print("Creating folders...")
for r in range(250):
    # Create a numbered folder under the base path
    file_path = 'D://豆瓣top250' + "//" + str(r + 1)
    if not os.path.exists(file_path):  # check whether the folder already exists
        os.mkdir(file_path)
    else:
        print(file_path + ' already exists')
print("Folders created")

# Fetch the page text
def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return 'request failed'

def getInfo(url):
    html = requests.get(url, headers=headers, timeout=30)
    root = etree.HTML(html.content)
    # Each movie sits in a <div class="info"> block
    for InformationBlock in root.xpath('//div[@class="info"]'):
        # Movie title
        title = InformationBlock.xpath('div[@class="hd"]/a/span[@class="title"]/text()')[0]
        # In the <div class="bd"> block, director, cast, release date,
        # country and genre all share one <p>
        info = InformationBlock.xpath('div[@class="bd"]/p[1]/text()')
        # Director and leading actors
        try:
            directorandStar = info[0].replace(" ", "").replace("\n", "").replace(".", "").split("/")[0]
        except:
            directorandStar = " "
        # Release date
        date = "Release date:" + info[1].replace(" ", "").replace("\n", "").split("/")[0]
        # Country of production
        country = "Country:" + info[1].replace(" ", "").replace("\n", "").split("/")[1]
        # Genre
        geners = "Genre:" + info[1].replace(" ", "").replace("\n", "").split("/")[2]
        # Rating
        rate = "Rating:" + InformationBlock.xpath('div[@class="bd"]/div/span[2]/text()')[0]
        # Number of reviewers
        comCount = "Reviewers:" + InformationBlock.xpath('div[@class="bd"]/div/span[4]/text()')[0]
        # One-line quote
        try:
            quote = "Quote:" + InformationBlock.xpath('div[@class="bd"]/p[2]/span[1]/text()')[0]
        except:
            quote = " "
        global Top
        # Show progress in the shell
        print(Top, title, directorandStar, rate, date, country, geners, comCount, quote)
        print("---------------------------------------------------------")
        datas = "{},{},{},{},{},{},{},{},{}".format(Top, title, directorandStar, rate, date, country, geners, comCount, quote)
        base = 'D://豆瓣top250' + "//" + str(Top)
        path = base + "//" + title + ".txt"
        with open(path, "w", newline="", encoding="utf_8_sig") as f:
            f.write(datas)
        Top += 1

# Page through the list
def nextPage():
    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
        getInfo(url)

# Create the txt files
if __name__ == '__main__':
    # Write a header line describing the columns
    with open('D://豆瓣top250//包含内容.txt', "w", newline="", encoding="utf_8_sig") as f:
        f.write("Top,title,directorandStar,rate,date,country,geners,comCount,quote" + "\n")
    nextPage()

# Download the jpg posters
if __name__ == '__main__':
    i = 0
    urls = ['https://movie.douban.com/top250?start=' + str(n) + '&filter=' for n in range(0, 250, 25)]
    for url in urls:
        r = getHTMLText(url)
        soup = BeautifulSoup(r, 'html.parser')
        titles = soup.select('div.hd a')
        rates = soup.select('span.rating_num')
        pics = soup.select('img[width="100"]')
        for title, rate, pic in zip(titles, rates, pics):
            data = {
                'title': list(title.stripped_strings),
                'rate': rate.get_text(),
                'pic': pic.get('src'),
            }
            i += 1
            middlename = str(i) + "//"
            fileName = data['title'][0] + '.jpg'
            pic1 = requests.get(data['pic'])
            with open("D://豆瓣top250//" + middlename + fileName, 'wb') as photo:
                photo.write(pic1.content)
            # Show progress in the shell
            print("Image " + str(i) + " downloaded")
