
Python crawler in practice: analyzing Douban reviews of Star Wars

Posted: 2021-05-18 09:15:51


################# Updated .2.2. The small problems are fully resolved. Happy ############################

'''

Windows 7

Sublime Text editor

Python 3.5.3

'''

from urllib import request           # fetches web pages
from bs4 import BeautifulSoup as bs  # parses the HTML
import re                            # regular expressions
import jieba                         # Chinese word segmentation: splits a sentence into words
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy                         # numerical computing
from wordcloud import WordCloud      # word-cloud rendering

# Open the Star Wars page and fetch the HTML. The comments live under
# <div class="comment">, so parse that tag.
requrl = '/subject/' + '25808075' + '/comments' + '?' + 'status=P'
resp = request.urlopen(requrl)
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_lists = soup.find_all('div', class_='comment')

eachCommentList = []
for item in comment_div_lists:
    if item.find_all('p')[0].string is not None:
        # the <p> tag under each comment div holds the review text
        eachCommentList.append(item.find_all('p')[0].string)
# print(eachCommentList)

# Data cleaning: join the list items into one string first to make cleaning easier.
comments = ''
for k in range(len(eachCommentList)):
    comments += (str(eachCommentList[k])).strip()
# print(comments)

# Strip the punctuation with one short, sharp regular expression:
# [\u4e00-\u9fa5]+ matches runs of Chinese characters, so every
# non-Chinese character is cleaned out completely.
pattern = re.compile(r'[\u4e00-\u9fa5]+')  # was "pile(...)"; the "re.com" prefix had been lost
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)     # join the matches back into a single string
# print(cleaned_comments)

# jieba.lcut() splits the Chinese string into a list, one word per item.
segment = jieba.lcut(cleaned_comments)
# print(segment)

# Aggregate the words so we can count frequencies.
words_df = pd.DataFrame({'segment': segment})
# print(words_df.head())  # inspect segment and words_df, not just words_df.head()

# Remove the meaningless high-frequency function words ("看", "太", "的" and so on).
# Many of these words carry no analytical value, so filter them out with a
# stop-word file.
stopwords = pd.read_csv(r"D:\ST\Python_work\stopwords.txt", index_col=False,
                        quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
# print(words_df)

# Count the word frequencies.
words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
# print(words_stat)

# Stage 3: display the result as a word cloud. simhei.ttf is the SimHei font,
# a Chinese typeface similar to SimSun.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                      max_font_size=80)  # set the font properties
# word_frequence is a dict, which wordcloud.fit_words() accepts directly.
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Source: https://mp./s/ukf-rormz5VnDgqa6uswJQ
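The cleaning step above hinges on that single character class. As a minimal standalone sketch (the sample comment text is made up for illustration), this is what survives the filter:

```python
import re

# Sketch of the cleaning step: [\u4e00-\u9fa5]+ matches runs of Chinese
# characters, so digits, Latin letters and punctuation are all dropped.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
sample = "星球大战8很好看!!! Star Wars, 2017."  # made-up sample comment
cleaned = ''.join(pattern.findall(sample))
print(cleaned)  # → 星球大战很好看
```

Note that `\u4e00-\u9fa5` covers only the common CJK Unified Ideographs block; full-width punctuation and digits fall outside it, which is exactly why they disappear.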

############################# Below is the old version (it has problems; kept for reference), written .12.01 ######################

# I wrote this document step by step while following a public-account article, but it still has a problem: no word cloud is produced.

# Source: https://mp./s/ukf-rormz5VnDgqa6uswJQ

# The specific problem: could not get %matplotlib inline to work.

# Supposedly this program has to be run in the Jupyter Notebook editor; it then
# raised another error, reportedly a version problem. Still unresolved, so if
# this article helps you get the crawler working, please get in touch.

from urllib import request           # fetches web pages
from bs4 import BeautifulSoup as bs  # parses the HTML
import re                            # regular expressions
import jieba                         # Chinese word segmentation
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic; only valid inside Jupyter/IPython, not a plain script
import matplotlib
import numpy                         # numerical computing
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud      # word-cloud rendering

'''
# Step 1: visit the page; Python uses the urllib library for this.
file1 = 'zhanlang.txt'
resp = request.urlopen('/cinema/nowplaying/shenzhen/')
html_data = resp.read().decode('utf-8')
# print(html_data)

# Step 2: parse the HTML we received and extract the data we need.
soup = bs(html_data, 'html.parser')
# the div with id "nowplaying" holds the data we want, movie names and so on
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
# print(nowplaying_movie_list)

# Loop to collect each movie's ID and name: data-subject holds the movie ID,
# and the img tag's alt attribute holds the movie name.
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_imag_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_imag_item['alt']
    nowplaying_list.append(nowplaying_dict)
# print(nowplaying_list)
'''
# + nowplaying_list[0]['id']

# Open the Star Wars page and fetch the HTML; the comments live under
# <div class="comment">, so parse that tag.
requrl = '/subject/' + '25808075' + '/comments' + '?' + 'status=P'
resp = request.urlopen(requrl)
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_lists = soup.find_all('div', class_='comment')

eachCommentList = []
for item in comment_div_lists:
    if item.find_all('p')[0].string is not None:
        # the <p> tag holds the review text
        eachCommentList.append(item.find_all('p')[0].string)
# print(eachCommentList)

# Data cleaning: join the list items into one string.
comments = ''
for k in range(len(eachCommentList)):
    comments += (str(eachCommentList[k])).strip()
# print(comments)

# Strip the punctuation from the data.
pattern = re.compile(r'[\u4e00-\u9fa5]+')  # was "pile(...)"; the "re.com" prefix had been lost
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
# print(cleaned_comments)

# Count word frequencies on the punctuation-free comments, using jieba segmentation.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})
# print(words_df.head())

# Remove the meaningless high-frequency function words ("看", "太", "的" and so on).
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
# print(words_df)

# Count the frequencies.
words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
# print(words_stat)

# Stage 3: display the result as a word cloud.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                      max_font_size=80)  # set the font properties
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
word_frequence_list = []
for key in word_frequence:
    temp = (key, word_frequence[key])
    word_frequence_list.append(temp)
wordcloud = wordcloud.fit_words(word_frequence_list)  # this line fails, see below
plt.imshow(wordcloud)

# Hi, asking the group for help with a Python crawler problem: the line
#   wordcloud = wordcloud.fit_words(word_frequence_list)
# above raises
#   AttributeError: 'list' object has no attribute 'items'
# Source: https://mp./s/ukf-rormz5VnDgqa6uswJQ
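The AttributeError reported above is consistent with `fit_words()` iterating over its argument's `.items()`: a plain list of (word, count) tuples has no such method, while a dict does, which is the fix the updated version uses. A minimal sketch, with made-up counts and no wordcloud dependency:

```python
# Made-up word counts standing in for words_stat.head(1000)
word_frequence_list = [("原力", 120), ("剧情", 95), ("特效", 80)]

# A list has no .items(), which is what fit_words() effectively calls:
try:
    word_frequence_list.items()
except AttributeError as e:
    print(e)  # 'list' object has no attribute 'items'

# The fix used in the updated version: pass a mapping instead.
word_frequence = dict(word_frequence_list)
print(word_frequence["原力"])  # → 120
```

So converting the list back to a dict (or never building the list at all, as the updated code does) resolves the error.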


