
Python crawler: scraping short reviews of the Douban movie 《芳华》 (Youth) and generating a word cloud from the segmented text


Project GitHub address: /kocor01/spider_cloub/

Python version: 3.6
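
Apart from the standard library, the project pulls in a handful of third-party packages. Judging from the imports in the code below, a requirements list along these lines should cover it (pymysql is imported but never actually used, so it is optional):

jieba
wordcloud
beautifulsoup4
numpy
matplotlib
Pillow
pymysql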

Recently I got the itch to play with word clouds, so I wrote a simple crawler and put together a simple crawler framework.

The crawler scrapes short reviews of 《芳华》 (Youth), a movie that has been quite popular lately, segments the text, and generates a word cloud.

Word segmentation is done with jieba, and the cloud image is generated with wordcloud.
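
Before going through the project code, here is a minimal, self-contained sketch of how the two libraries fit together; the sample string and the output file name demo.png are made up purely for illustration:

# coding:utf-8
# Minimal sketch: jieba tokenizes the text, wordcloud renders it.
# The sample string and demo.png are placeholders, not part of the project.
import jieba
from wordcloud import WordCloud

sample = "电影《芳华》的短评示例文本"        # made-up placeholder text
segmented = " ".join(jieba.cut(sample))      # space-separated tokens, as WordCloud expects

# a Chinese-capable font (e.g. the project's simhei.ttf) is required to render Chinese words
wc = WordCloud(font_path="./extra_dict/simhei.ttf", background_color="white")
wc.generate(segmented)
wc.to_file("demo.png")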

For the mask image behind the cloud, I took a friend's goofy selfie and lightly photoshopped it (blacked it out completely so he can't be identified).

Word cloud mask template:

The generated word cloud:

Basic crawler framework:

spider_main.py - crawler entry point

url_manager.py - URL manager

html_downloader.py - page downloader

html_parser.py - data extractor

html_outputer.py - data processor

word_cloud.py - word cloud generator

The extra_dict folder contains the following files:

li.png - word cloud mask template

simhei.ttf - font file used to render the word cloud

str.txt - the scraped movie short reviews

stop_words.txt - stop words excluded during segmentation

cut_str.txt - output of jieba segmentation

yun.png - the final word cloud

The code is as follows:

spider_main.py - crawler entry point

#coding:utf-8
import url_manager, html_parser, html_outputer, html_downloader, word_cloud


class SpiderMain(object):
    def __init__(self):
        # URL manager
        self.urls = url_manager.UrlManager()
        # page downloader
        self.downloader = html_downloader.HtmlDownloader()
        # data extractor
        self.parser = html_parser.HtmlParser()
        # data processor
        self.outputer = html_outputer.HtmlOutputer()
        # word cloud generator
        self.cloud = word_cloud.Wordcloud()

    def craw(self, root_url):
        count = 1
        # seed the URL manager with the entry URL
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # take one URL that has not been crawled yet
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                # download the page
                html_cont = self.downloader.download(new_url)
                # extract the next-page URL and the review texts
                new_url, new_datas = self.parser.parser(new_url, html_cont)
                # queue the next page for crawling
                self.urls.add_new_url(new_url)
                # collect the extracted reviews
                self.outputer.collect_data(new_datas)
                # crawl at most 10 pages
                if count == 10:
                    break
                count = count + 1
            except:
                print("craw failed")
        # write the collected reviews to str.txt
        self.outputer.process_data()
        # word segmentation
        self.outputer.cut_str()
        # generate the word cloud
        self.cloud.make()
        print("finish")


if __name__ == "__main__":
    # crawler entry URL (the Douban comments page for the movie)
    root_url = "/subject/26862829/comments?status=P"
    obj_spider = SpiderMain()
    # start the crawler
    obj_spider.craw(root_url)

url_manager.py - URL manager

#coding:utf-8


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
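
A quick sketch of the intended round trip through the manager (the example.com URLs are placeholders):

# Illustrative use of UrlManager; the example.com URLs are placeholders.
import url_manager

urls = url_manager.UrlManager()
urls.add_new_url("https://example.com/page1")
urls.add_new_urls(["https://example.com/page1", "https://example.com/page2"])  # duplicates are ignored

while urls.has_new_url():
    url = urls.get_new_url()   # moves the URL from new_urls to old_urls
    print(url)

urls.add_new_url("https://example.com/page1")  # already crawled, so it is not queued again
print(urls.has_new_url())      # False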

html_downloader.py - page downloader

#coding:utf-8
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # send the request with a browser-like User-Agent header
        request = urllib.request.Request(url)
        request.add_header("user-agent", "Mozilla/5.0")
        response = urllib.request.urlopen(request)
        if response.getcode() != 200:
            return None
        return response.read()

html_parser.py - data extractor

#coding:utf-8
import http.cookiejar
from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    def parser(self, page_url, content):
        if page_url is None or content is None:
            return
        soup = BeautifulSoup(content, "html.parser", from_encoding='utf-8')
        new_url = self._get_new_url(page_url, soup)
        new_datas = self._get_new_datas(page_url, soup)
        return new_url, new_datas

    def _get_new_url(self, page_url, soup):
        new_url = soup.find('div', id="paginator").find('a', class_="next").get('href')
        new_full_url = urllib.parse.urljoin(page_url, new_url)
        return new_full_url

    def _get_new_datas(self, page_url, soup):
        res_datas = set()
        contents = soup.find_all('div', class_="comment-item")
        for content in contents:
            res_datas.add(content.find('div', class_="comment").find('p').get_text())
        return res_datas

html_outputer.py - data processor

#coding:utf-8
import pymysql
import jieba.analyse


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        for d in data:
            self.datas.append(d)

    def process_data(self):
        # write all collected reviews to str.txt
        file_object = open('./extra_dict/str.txt', 'w', encoding='utf-8', errors='ignore')
        for data in self.datas:
            file_object.write(data)
        file_object.close()

    def cut_str(self):
        content = open('./extra_dict/str.txt', encoding='utf-8', errors='ignore').read()
        jieba.analyse.set_stop_words("./extra_dict/stop_words.txt")
        tags = jieba.analyse.extract_tags(content, topK=1000, withWeight=True)
        file_object = open('./extra_dict/cut_str.txt', 'w', encoding='utf-8')
        for v, n in tags:
            # the weight is a float; multiply by 10,000 and truncate to get an integer
            data_str = v + '\t' + str(int(n * 10000)) + '\n'
            file_object.write(data_str)
        file_object.close()
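
The resulting cut_str.txt has one keyword per line, followed by a tab and the scaled weight. Purely for illustration (the words and numbers below are made up), it looks something like this:

芳华	1234
青春	987
文工团	653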

word_cloud.py - word cloud generator

from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


class Wordcloud(object):
    def make(self):
        d = path.dirname(__file__)
        # read the segmented text
        text = open(path.join(d, './extra_dict/cut_str.txt'), encoding='utf-8').read()
        # read the mask / color image
        alice_coloring = np.array(Image.open(path.join(d, "./extra_dict/li.png")))
        stopwords = set(STOPWORDS)
        stopwords.add("said")
        wc = WordCloud(font_path="./extra_dict/simhei.ttf",
                       background_color="white", max_words=2000, mask=alice_coloring,
                       stopwords=stopwords, max_font_size=40, random_state=42)
        # generate the word cloud
        wc.generate(text)
        # create a coloring based on the mask image
        image_colors = ImageColorGenerator(alice_coloring)
        # show the default-colored cloud
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.figure()
        # recolor the word cloud with the image colors and show it
        # (color_func=image_colors could also be passed directly to the constructor)
        plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
        plt.axis("off")
        plt.figure()
        # show the mask image itself
        plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
        plt.axis("off")
        # save the final cloud to yun.png
        wc.to_file(path.join(d, "./extra_dict/yun.png"))
        plt.show()
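
One caveat: wc.generate() re-tokenizes cut_str.txt as plain text and counts occurrences, so the TF-IDF weights written by cut_str() are largely ignored (each keyword appears only once). If you want the cloud to reflect those weights, a possible alternative, assuming the tab-separated word/weight format written above, is to rebuild a frequency dict and call generate_from_frequencies:

# Sketch: build the cloud from the word/weight pairs in cut_str.txt instead of raw text.
# Assumes the tab-separated format written by HtmlOutputer.cut_str().
from wordcloud import WordCloud

frequencies = {}
with open('./extra_dict/cut_str.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            word, weight = parts
            frequencies[word] = int(weight)

wc = WordCloud(font_path="./extra_dict/simhei.ttf", background_color="white",
               max_words=2000, max_font_size=40, random_state=42)
wc.generate_from_frequencies(frequencies)
wc.to_file("./extra_dict/yun.png")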
