
Python Crawler | Scraping Weibo and Bilibili Data


Contents

1. bili_comment.py

2. bili_comment_pic.py

3. bilibili.py

4. bilihot_pic.py

5. bilisearch_pic.py

6. draw_cloud.py

7. weibo.py

8. weibo_comment.py

9. weibo_comment_pic.py

10. weibo_pic.py

11. weibo_top.py

12. weibo_top_pic.py

13. weibo_top_pie.py

14. pachong.py

15. Overview of the code files

1. bili_comment.py

import requests            # send HTTP requests
import pandas as pd        # save csv files
import os                  # check whether a file exists
import time
from time import sleep     # wait between requests to avoid anti-crawling measures
import json
import random              # random numbers
import os.path
import csv
import re
import bili_comment_pic


def trans_date(v_timestamp):
    """Convert a 10-digit Unix timestamp to a time string."""
    timeArray = time.localtime(v_timestamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime


def getoid(bv):
    # the host was stripped from the original source; assumed to be www.bilibili.com
    resp = requests.get("https://www.bilibili.com/video/" + bv)
    # the aid ("oid") appears in the page source; extract it with a regular expression
    obj = re.compile(f'"aid":(?P<id>.*?),"bvid":"{bv}"')
    oid = obj.search(resp.text).group('id')
    print('oid是' + oid)   # confirm that the oid parameter was obtained
    return oid


def get_bili_comment(bv_list, max_page):
    for bvid in bv_list:
        # output file name
        bili_file = 'biliComment_{}pages_{}.csv'.format(max_page, bvid)
        # if the csv already exists, delete it first
        if os.path.exists(bili_file):
            os.remove(bili_file)
            print('存在,已删除:{}'.format(bili_file))
        # # full request headers (the cookie needs to be refreshed periodically)
        # headers = {
        #     'Authority': '',
        #     'Accept': 'application/json, text/plain, */*',
        #     'Accept-Encoding': 'gzip, deflate, br',
        #     'Accept-Language': 'zh-CN,zh;q=0.9',
        #     'Cookie': 'buvid3=09193776-D54E-C4E9-D77E-A3CEC61048A052609infoc; b_nut=1666432252; i-wanna-go-back=-1; b_ut=7; _uuid=9837E983-2521-B3D3-E815-AF3877BF973253126infoc; buvid_fp=bca1b3ca8709dc8fafd31a3014e880cb; nostalgia_conf=-1; PVID=1; CURRENT_FNVAL=4048; rpdid=0z9ZwfQgnR|lkoRrAma|2ss|3w1Q0AxQ; sid=73446m9u; buvid4=FFE4C4F3-FFE7-4A1B-F2E9-BA77F904B1B753643-022102217-RoU6Io6eaXN5hT%2FTDpMpDggrSpyQiYXaOp1a506ie3QU%2FFwMxK3Zhw%3D%3D; b_lsid=E6E6D472_1883D6194B0',
        #     'Origin': '',
        #     'Referer': '/video/BV1zh4y1H7ZS/?spm_id_from=333.999.0.0&vd_source=7dd889e8bc19f867cf9a8b6d62c711ee',
        #     'Sec-Ch-Ua': '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        #     'Sec-Ch-Ua-Mobile': '?0',
        #     'Sec-Ch-Ua-Platform': '"macOS"',
        #     'Sec-Fetch-Dest': 'empty',
        #     'Sec-Fetch-Mode': 'cors',
        #     'Sec-Fetch-Site': 'same-site',
        #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
        # }
        # simpler request headers (referer host restored; it was stripped in the original)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
            "referer": "https://www.bilibili.com/"
        }
        for page in range(1, max_page + 1):
            # request parameters
            # type: comment type, fixed to 1
            # oid:  which video
            # next: which page of comments
            # mode: 3 = sort by popularity, 2 = sort by time
            params = {
                'jsonp': 'jsonp',
                'mode': '3',
                'oid': getoid(bvid),
                'next': page,
                'type': '1',
            }
            # comment API (the host was stripped from the original source; assumed to be api.bilibili.com)
            url = "https://api.bilibili.com/x/v2/reply/main"
            response = requests.get(url, headers=headers, params=params)
            print(response.status_code)
            data_list = response.json()['data']['replies']
            # lists that will hold the parsed comment data
            comment_list = []      # comment text
            time_list = []         # comment time
            # location_list = []   # commenter IP region
            user_list = []         # commenter name
            like_list = []         # like count
            replyCount_list = []   # reply count
            userid_list = []       # commenter id
            # loop over every comment
            for a in data_list:
                # comment text
                comment = a['content']['message']
                comment_list.append(comment)
                # comment time (renamed from `time` to avoid shadowing the time module)
                ctime = a['ctime']
                time_list.append(trans_date(ctime))
                # # IP region (disappears after a while, so it is not collected)
                # location = a['source']
                # location_list.append(location)
                # reply count
                replyCount = a['rcount']
                replyCount_list.append(replyCount)
                # like count
                like = a['like']
                like_list.append(like)
                # commenter name
                user = a['member']['uname']
                user_list.append(user)
                # commenter id
                userid = a['member']['mid']
                userid_list.append(userid)
            # assemble the lists into a DataFrame
            df = pd.DataFrame({
                # '视频链接': '/video/' + v_bid,
                '评论页码': page,
                '评论时间': time_list,
                '评论作者': user_list,
                '评论id': userid_list,
                # 'IP属地': location_list,
                '点赞数': like_list,
                '评论回复数': replyCount_list,
                '评论内容': comment_list,
            })
            # only write the header row the first time
            if os.path.exists(bili_file):
                header = None
            else:
                header = ['评论页码', '评论时间', '评论作者', '评论id', '点赞数', '评论回复数', '评论内容']
            column = ['评论页码', '评论时间', '评论作者', '评论id', '点赞数', '评论回复数', '评论内容']
            # append to the csv file
            df.to_csv(bili_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
            # print('csv保存成功:{}'.format(bili_file))
            print('第{}页爬取完成'.format(page))
            # print(df)
        # data cleaning: remove duplicates
        df = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
        os.remove(bili_file)
        df.drop_duplicates(subset='评论内容', inplace=True, keep='first')
        # save the cleaned csv again
        column = header = ['评论页码', '评论时间', '评论作者', '评论id', '点赞数', '评论回复数', '评论内容']
        df.to_csv(bili_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
        print('数据清洗完成')
        bili_comment_pic.main(bili_file)


if __name__ == '__main__':
    # BV numbers of the videos; several videos can be crawled in one run
    # bv_list = ['BV1Ss4y1M7KT', 'BV1VM411N7qc']
    bv_list = [str(x) for x in input("请输入视频bv号(示例:BV1Ss4y1M7KT,BV1VM411N7qc),以逗号分隔:").split(',')]
    # maximum number of pages to crawl
    max_page = int(input("请输入搜索的页数"))
    # start crawling
    get_bili_comment(bv_list=bv_list, max_page=max_page)
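The ctime field of each reply is a 10-digit Unix timestamp, which trans_date turns into a readable local-time string. The same conversion in isolation, with a made-up timestamp:

import time

ts = 1684738000   # hypothetical ctime value for illustration
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts)))   # local time, e.g. 2023-05-22 ...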

2. bili_comment_pic.py

# allow a duplicate OpenMP runtime, ignore the error
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


def view(info, bili_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    likes = info['点赞数']        # likes
    reply = info['评论回复数']    # replies
    comment = info['评论内容']    # comment text
    # print(comment)
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # ***** bar chart of like counts *****
    fig, ax1 = plt.subplots()
    length = len(comment)
    plt.bar(x=np.arange(length), tick_label=comment, height=likes, color='red')
    plt.title('点赞数和评论数数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('评论内容')    # x-axis label
    plt.ylabel('点赞数')      # y-axis label
    plt.xticks(rotation=90, color='green')   # rotation and colour of the x tick labels
    # ***** line chart of reply counts *****
    ax2 = ax1.twinx()          # required for the combined chart
    ax2.plot(reply, color='cyan')
    plt.ylabel('评论数')
    plt.plot(1, label='点赞数', color="red", linewidth=5.0)   # dummy plot so the legend has an entry
    # plt.plot(1, label='评论回复数', color="cyan", linewidth=1.0, linestyle="-")
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(bili_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(bili_file):
    info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
    info = info.nlargest(60, '点赞数')
    info = info.reset_index(drop=True)
    view(info, bili_file)


if __name__ == '__main__':
    main('biliComment_15pages_BV1Ss4y1M7KT.csv')
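The chart overlays two series with different scales: likes as bars on the left axis and reply counts as a line on a second y-axis created with twinx(). The pattern in isolation, on made-up numbers:

import numpy as np
import matplotlib.pyplot as plt

likes = [120, 80, 60]      # hypothetical like counts (bar series)
replies = [10, 25, 5]      # hypothetical reply counts (line series)
fig, ax1 = plt.subplots()
ax1.bar(np.arange(len(likes)), likes, color='red')   # bars use the left y-axis
ax2 = ax1.twinx()                                    # second y-axis sharing the same x-axis
ax2.plot(replies, color='cyan')                      # line uses the right y-axis
plt.show()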

3. bilibili.py

import requests
from urllib.parse import quote
import json
import time
from time import sleep
import pandas as pd
import hashlib
import bilihot_pic
import bilisearch_pic

"""
The bilisearch class
1. Constructor parameters
   search: the keyword to search for
   page:   how many result pages to fetch
2. Usage
   a = bilisearch(search, page)   # create the object
   a.findall()                    # crawl the data and write it to csv files
"""


class bilisearch():
    # first argument is the search keyword, second is the number of pages
    def __init__(self, search, page):
        # encode the input
        self.search = search
        self.searchurl = '&keyword=' + quote(search, 'utf-8')
        # build the request headers
        # a cookie seems to be required; it may stop working after a while
        # (several host names were stripped from the original source, hence the bare paths below)
        self.head = {
            'authority': '',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
            'Referer': "/all?from_source=webtop_search&spm_id_from=333.1007&search_source=5keyword=",
            'referer': '/',
            'cookie': 'buvid3=05746C34-6526-44A7-9132-4C0A7180E63C148796infoc; LIVE_BUVID=AUTO4216287558369376; i-wanna-go-back=-1; CURRENT_BLACKGAP=0; buvid4=CE2658E1-DE0F-1555-42F9-BBE8E7E701B973047-02116-NXuDwzBl0l7IPmxDzx269g%3D%3D; buvid_fp_plain=undefined; blackside_state=0; is--channel=1; _uuid=136F106D6-AA102-198A-C5DD-7351A72CFDE849203infoc; b_nut=100; rpdid=0zbfvWJdeE|54lJB1MA|2Ln|3w1OVksf; CURRENT_QUALITY=80; hit-new-style-dyn=1; CURRENT_PID=b98a29b0-cd2f-11ed-9194-494fac97dd7c; fingerprint=5050e9471226aa5c2be3ac56100522f8; header_theme_version=CLOSE; nostalgia_conf=-1; hit-dyn-v2=1; home_feed_column=5; CURRENT_FNVAL=4048; bp_video_offset_329341133=781400043392336000; SESSDATA=0948d8e9%2C1696396399%2Cef62d%2A42; bili_jct=cb7a5dbbd0153907fff4b713334d6833; DedeUserID=329341133; DedeUserID__ckMd5=acfa5c750e5b3e7f; PVID=1; b_ut=5; innersign=0; b_lsid=7C37E147_1875B2E5B1D; bsource=search_bing; buvid_fp=5050e9471226aa5c2be3ac56100522f8'
        }
        # number of pages to crawl
        self.page = page
        # data kept in memory
        # self.data = []

    def dataProcess(self, data):
        # rows that will be written to csv
        storedata = []
        # every page contains 30 items
        for i in range(30):
            if (data[i]['type'] == 'picture_ad_0'):
                continue
            # author
            author = data[i]['author']
            # title, with the <em class="keyword"> highlight tags removed
            title = data[i]['title'].replace('<em class="keyword">', '').replace('</em>', '')
            # play count
            play = data[i]['play']
            # description
            description = data[i]['description']
            # cover image
            pic = data[i]['pic']
            # playback URL
            arcurl = data[i]['arcurl']
            # id
            id = data[i]['id']
            # publish time: 10-digit Unix timestamp converted to a time string
            pubdate = data[i]['pubdate']
            timeArray = time.localtime(pubdate)
            pubdate = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            # store each row as a list
            # self.data.append({'author': author, 'title': title, 'play': play, 'description': description, 'pic': pic, 'arcurl': arcurl, 'id': id})
            storedata.append([author, title, play, description, pic, arcurl, id, pubdate])
        return storedata

    def reverse(self, page):
        timenow = int(time.time())
        if (page == 1):
            an = f'refresh=true&_extra=&ad_resource=5646&context=&duration=&from_source=&from_spmid=333.337&highlight=1&keyword={self.search}&order=&page=1&page_size=42&platform=pc&qv_id=EfNJjEtrA0N5DxzPVKch7Kz6v33ezlFR&single_column=0&source_tag=3&web_location=1430654&wts={timenow}'
            wt = '55540207d820a7368ab7e104169d409d'
            data = an + wt
            md = hashlib.md5(data.encode('UTF-8'))
            return md.hexdigest(), timenow
        else:
            an = f'refresh=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset={str((page - 1) * 30)}&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword={self.search}&page={page}&page_size=42&platform=pc&qv_id=hJgZIEUY51fw9Pp7s8pidIVEJ7Z08KaS&search_type=video&single_column=0&source_tag=3&web_location=1430654&wts={timenow}'
            wt = '55540207d820a7368ab7e104169d409d'
            data = an + wt
            md = hashlib.md5(data.encode('UTF-8'))
            return md.hexdigest(), timenow

    # search with the default ("comprehensive") sort order
    def findall(self):
        for pnum in range(1, int(self.page) + 1):
            # sign the query string, then request the data
            # (the API host was stripped from the original source; assumed to be api.bilibili.com)
            w_rid, timenow = self.reverse(pnum)
            if (pnum == 1):
                target = requests.get(
                    f'https://api.bilibili.com/x/web-interface/wbi/search/all/v2?__refresh__=true&_extra=&context=&page={pnum}&page_size=42&order=&duration=&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword={self.search}&qv_id=noyCOTfEBm8ZzMVGopKgzYbiqLFxoAn1&ad_resource=5646&source_tag=3&web_location=1430654&w_rid={w_rid}&wts={timenow}',
                    headers=self.head)
            else:
                target = requests.get(
                    f'https://api.bilibili.com/x/web-interface/wbi/search/all/v2?refresh=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset={(pnum - 1) * 30}&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword={self.search}&page={pnum}&page_size=42&platform=pc&qv_id=hJgZIEUY51fw9Pp7s8pidIVEJ7Z08KaS&search_type=video&single_column=0&source_tag=3&web_location=1430654&w_rid={w_rid}&wts={timenow}',
                    headers=self.head)
            # parse the JSON into Python objects
            data = json.loads(target.text)
            # rows for the csv file
            storedata = self.dataProcess(data['data']['result'][10]['data'])
            print('第', pnum, '页完成')
            # write the page to its own csv
            self.storeCsvdata('b站清单_' + str(self.search) + '_第' + str(pnum) + '页.csv', storedata, pnum)
            # wait one second between pages
            sleep(1)

    # write one page of results to a csv file
    def storeCsvdata(self, filename, storedata, pagenum):
        with open(filename, 'a+') as fp:
            # column names
            name = ['作者', '标题', '播放量', '简介', '封面', '播放地址', 'id', '时间']
            # write the file
            writer = pd.DataFrame(storedata, columns=name)
            writer.to_csv(filename, index=False, encoding='utf-8-sig')
            bilisearch_pic.main(filename)
            fp.close()


"""
The bilihot class
1. Constructor parameters: none
2. Usage
   a = bilihot()        # create the object
   a.findall()          # fetch the ranking list
   a.storeCsvdata()     # save the data
   a.data               # inspect the data; a.data[i][j], where i is the row and
                        # j indexes ['作者','标题','播放量','简介','封面','id','播放地址','时间','分区']
"""


class bilihot():
    def __init__(self):
        # build the request headers
        self.head = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
            'Referer': "/all?from_source=webtop_search&spm_id_from=333.1007&search_source=5keyword=",
            'referer': '/v/popular/rank/all',
            'authority': '',
        }
        # keep a copy of the data
        self.data = []

    def findall(self):
        # request the ranking list (API host assumed to be api.bilibili.com)
        target = requests.get('https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all', headers=self.head)
        # parse the JSON into Python objects
        data = json.loads(target.text)
        for i in data['data']['list']:
            # author
            author = i['owner']['name']
            # title
            title = i['title']
            # play count
            play = i['stat']['view']
            # description
            desc = i['desc']
            # cover image
            pic = i['pic']
            # id
            id = i['aid']
            # playback URL
            arcurl = i['short_link_v2']
            # publish time: 10-digit Unix timestamp converted to a time string
            pubdate = i['pubdate']
            timeArray = time.localtime(pubdate)
            pubdate = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            # category
            tname = i['tname']
            self.data.append([author, title, play, desc, pic, id, arcurl, pubdate, tname])
        print('请求数据成功')

    def storeCsvdata(self):
        with open('b站排行榜.csv', 'a+') as fp:
            # column names
            name = ['作者', '标题', '播放量', '简介', '封面', 'id', '播放地址', '时间', '分区']
            # write the file
            writer = pd.DataFrame(self.data, columns=name)
            writer.to_csv('b站排行榜.csv', index=False, encoding='utf-8-sig')
            print('写入成功')
            bilihot_pic.main('b站排行榜.csv')
            fp.close()


if __name__ == '__main__':
    # search: the keyword to search for
    search = input("请输入搜索的关键词")
    # page: how many result pages to fetch
    page = int(input("请输入搜索的页数"))
    # create the search object
    a = bilisearch(search, page)
    # crawl the data and save it
    a.findall()
    # create the ranking object
    b = bilihot()
    # fetch the ranking list
    b.findall()
    # save the data
    b.storeCsvdata()
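Each request to the wbi search endpoint carries a w_rid signature: reverse() rebuilds the query-parameter string (including the current wts timestamp), appends a fixed salt and takes the MD5 hex digest. Just that signing step, as a sketch using the same salt string as above (the parameter string here is a made-up example):

import hashlib
import time

def sign(params: str, salt: str = '55540207d820a7368ab7e104169d409d') -> str:
    # w_rid is the hex MD5 of the parameter string followed by the salt
    return hashlib.md5((params + salt).encode('utf-8')).hexdigest()

wts = int(time.time())
print(sign(f'keyword=test&page=1&wts={wts}'))   # hypothetical parameter string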

4. bilihot_pic.py

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np


def view(info, bili_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    title = info['标题']
    views = info['播放量']
    # ***** horizontal bar chart of play counts *****
    fig, ax1 = plt.subplots()
    length = len(title)
    plt.barh(y=np.arange(length), tick_label=title, width=views, color='cyan')
    plt.title('标题和播放量的数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('播放量')   # x-axis label
    plt.ylabel('标题')     # y-axis label
    plt.yticks(color='green')   # colour of the y tick labels
    plt.plot(1, label='播放量', color="cyan", linewidth=5.0)   # dummy plot so the legend has an entry
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(bili_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(bili_file):
    info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
    info = info.nlargest(50, '播放量')
    info = info.sort_values('播放量', ascending=True)
    view(info, bili_file)


if __name__ == '__main__':
    main('b站排行榜.csv')

5. bilisearch_pic.py

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np


def view(info, bili_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    title = info['标题']
    views = info['播放量']
    # ***** horizontal bar chart of play counts *****
    fig, ax1 = plt.subplots()
    length = len(title)
    plt.barh(y=np.arange(length), tick_label=title, width=views, color='green')
    plt.title('标题和播放量的数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('播放量')   # x-axis label
    plt.ylabel('标题')     # y-axis label
    plt.yticks(color='blue')   # colour of the y tick labels
    plt.plot(1, label='播放量', color="green", linewidth=5.0)   # dummy plot so the legend has an entry
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(bili_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(bili_file):
    info = pd.read_csv(bili_file, engine='python', encoding='utf-8-sig')
    info = info.sort_values('播放量', ascending=True)
    view(info, bili_file)


if __name__ == '__main__':
    main('b站清单_疫情_第1页.csv')

6. draw_cloud.py

import numpy as np
import pandas as pd
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image


def draw_cloud(weibo_file):
    image = Image.open('.\\background.jpg')   # background image used as the cloud outline
    graph = np.array(image)
    # font, background colour, maximum number of words, and the mask shape
    wc = WordCloud(font_path='msyh.ttc', background_color='white', max_words=100, mask=graph)
    fp = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')   # read the frequency file
    name = list(fp['热搜内容'])   # words
    value = fp['热搜热度']        # word weights
    for i in range(len(name)):
        name[i] = str(name[i])
    dic = dict(zip(name, value))   # store the frequencies as a dict
    print(dic)
    wc.generate_from_frequencies(dic)   # build the cloud from the given frequencies
    image_color = ImageColorGenerator(graph)   # colours taken from the background image
    wc.to_file(r'.\图片\draw_cloud-{}.png'.format(weibo_file))   # output file name


if __name__ == '__main__':
    draw_cloud('微博top_fun.csv')
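generate_from_frequencies drives the cloud from a {phrase: weight} dict, which is why the script zips the 热搜内容 and 热搜热度 columns together. A minimal sketch with made-up weights (it assumes the same msyh.ttc font file is available locally):

from wordcloud import WordCloud

freq = {'话题A': 120, '话题B': 80, '话题C': 30}    # hypothetical phrase -> heat mapping
wc = WordCloud(font_path='msyh.ttc', background_color='white', max_words=100)
wc.generate_from_frequencies(freq)
wc.to_file('demo_cloud.png')                      # hypothetical output name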

7. weibo.py

import os.path
import re
from jsonpath import jsonpath
import requests
import pandas as pd
import datetime
from fake_useragent import UserAgent
import weibo_pic


def trans_time(v_str):
    """Convert a GMT-style time string to the standard format."""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timearray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timearray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time


def get_weibo_list(v_keyword, v_max_page):
    """
    Crawl the list of Weibo posts.
    :param v_keyword:  search keyword
    :param v_max_page: number of pages to crawl
    :return: None
    """
    # output file name
    v_weibo_file = '微博清单_{}_前{}页.csv'.format(v_keyword, v_max_page)
    # if the csv already exists, delete it first
    if os.path.exists(v_weibo_file):
        os.remove(v_weibo_file)
        print('微博清单存在,已删除:{}'.format(v_weibo_file))
    for page in range(1, v_max_page + 1):
        print('===开始爬取第{}页微博==='.format(page))
        # request headers
        ua = UserAgent()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br"
        }
        # request URL (the host was stripped from the original source; assumed to be m.weibo.cn)
        url = 'https://m.weibo.cn/api/container/getIndex'
        # request parameters
        params = {
            "containerid": "100103type=1&q={}".format(v_keyword),
            "page_type": "searchall",
            "page": page
        }
        # send the request
        r = requests.get(url, headers=headers, params=params)
        print(r.status_code)
        # parse the JSON data
        cards = r.json()["data"]["cards"]
        # post text
        text_list = jsonpath(cards, '$..mblog.text')
        # clean the post text with a regular expression
        dr = re.compile(r'<[^>]+>', re.S)
        text2_list = []
        print('text_list is:')
        print(text_list)
        if not text_list:
            # nothing was returned for this page, move on to the next one
            continue
        if type(text_list) == list and len(text_list) > 0:
            for text in text_list:
                text2 = dr.sub('', text)   # strip the HTML tags from the post text
                print(text2)
                text2_list.append(text2)
        # creation time
        time_list = jsonpath(cards, '$..mblog.created_at')
        time_list = [trans_time(v_str=i) for i in time_list]
        # author
        author_list = jsonpath(cards, '$..mblog.user.screen_name')
        # post id
        id_list = jsonpath(cards, '$..mblog.user.id')
        # post bid
        bid_list = jsonpath(cards, '$..mblog.bid')
        # repost count
        reposts_count_list = jsonpath(cards, '$..mblog.reposts_count')
        # comment count
        comments_count_list = jsonpath(cards, '$..mblog.comments_count')
        # like count
        attitudes_count_list = jsonpath(cards, '$..mblog.attitudes_count')
        df = pd.DataFrame({
            '页码': [page] * len(id_list),
            '微博id': id_list,
            '微博bid': bid_list,
            '微博作者': author_list,
            '发布时间': time_list,
            '微博内容': text2_list,
            '转发数': reposts_count_list,
            '评论数': comments_count_list,
            '点赞数': attitudes_count_list
        })
        # only write the header row the first time
        if os.path.exists(v_weibo_file):
            header = None
        else:
            header = ['页码', '微博id', '微博bid', '微博作者', '发布时间', '微博内容', '转发数', '评论数', '点赞数']
        column = ['页码', '微博id', '微博bid', '微博作者', '发布时间', '微博内容', '转发数', '评论数', '点赞数']
        # append to the csv file
        df.to_csv(v_weibo_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
        print('csv保存成功:{}'.format(v_weibo_file))
    # data cleaning: remove duplicates
    df = pd.read_csv(v_weibo_file, engine='python', encoding='utf-8-sig')
    os.remove(v_weibo_file)
    df.drop_duplicates(subset='微博bid', inplace=True, keep='first')
    # save the cleaned csv again
    header = ['页码', '微博id', '微博bid', '微博作者', '发布时间', '微博内容', '转发数', '评论数', '点赞数']
    column = header
    df.to_csv(v_weibo_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
    print('数据清洗完成')
    weibo_pic.main(v_weibo_file)


if __name__ == '__main__':
    # search keyword
    search_keyword = input("请输入搜索的关键词")
    # number of pages to crawl
    max_search_page = int(input("请输入搜索的页数"))
    # crawl the Weibo list
    get_weibo_list(v_keyword=search_keyword, v_max_page=max_search_page)
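created_at is returned in a GMT-style form such as 'Tue May 23 09:30:00 +0800 2023', which trans_time parses with the fixed format string above. A standalone illustration (the sample string is made up; %a and %b assume an English locale):

import datetime

GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
s = 'Tue May 23 09:30:00 +0800 2023'    # hypothetical created_at value
print(datetime.datetime.strptime(s, GMT_FORMAT).strftime("%Y-%m-%d %H:%M:%S"))   # 2023-05-23 09:30:00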

8. weibo_comment.py

import requests            # send HTTP requests
import pandas as pd        # save csv files
import os                  # check whether a file exists
import datetime
import time
from time import sleep     # wait between requests to avoid anti-crawling measures
import json
import random              # random numbers
import os.path
import csv
import re
import weibo_comment_pic


def trans_time(v_str):
    """Convert a GMT-style time string to the standard format."""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timearray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timearray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time


def get_bili_comment(weiboID_list, max_page):
    for weibo_id in weiboID_list:
        # output file name
        wbComment_file = 'weiboComment_{}pages_{}.csv'.format(max_page, weibo_id)
        # if the csv already exists, delete it first
        if os.path.exists(wbComment_file):
            os.remove(wbComment_file)
            print('存在,已删除:{}'.format(wbComment_file))
        # request headers
        headers = {
            # without a cookie only the first page can be crawled
            'cookie': '__bid_n=1883c7fc76e10d57174207; FPTOKEN=IBsER/uKazbtpMIEgvaOTfAuHsmYQM5g0VL9U1G3ybs72PsWHEBbiKv0w+R59BrOvSwxDKJevIDwL0SSwPV5yWd3lIFsx6KXQ/qYPpPTjTRW5kFr+j74rsScC6MKc1G9142e5tEEf7atvY/zTxl9B6jy/y7MEo0ETLT0VjL6nbpzkWe/SnIw97Tjb+9lqYoGHS6lPqZ5yAhDPKn0KK4htwxqr0qMglAG6ZcT7mn+BUZAygRSrqWZwZ6KSE0r27qsR0bDTAI8dsQFq1gPfYONp5UHfw9FFsBiscLULixqm31wTHYziK8gxi0/R6yIQ8Tq3OQkNmx+Kw7E/8YknGOiVmpjfRn5FNShZs3/t8SNBJEcZ9qaQnw/iF/jwPoFkMXz87Tp22aQUmFgeQu/u0wAYQ==|wC9ITrusKUtoBk6wTqvs+jaY6iwSJyX4pD0y+hSvnOA=|10|acf98643db3def55913fefef5034d5ee; WEIBOCN_FROM=1110106030; loginScene=10; SUB=_2A25JbkPWDeRhGeNH7FIV-SjKzjyIHXVqkW2erDV6PUJbkdAGLRbkkW1NSoXhCHcUhbni8gGXfjdc5HNqec9qABj_; MLOGIN=1; _T_WM=98495433469; XSRF-TOKEN=a62fb7; mweibo_short_token=9f0e28d6c9; M_WEIBOCN_PARAMS=oid%3D4903111417922777%26luicode%3D20000061%26lfid%3D4903111417922777%26uicode%3D20000061%26fid%3D4903111417922777',
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
            'X-Xsrf-Token': 'a62fb7'
        }
        max_id = ''
        for page in range(1, max_page + 1):
            # the host was stripped from the original source; assumed to be m.weibo.cn
            if page == 1:
                # the first page carries no max_id parameter
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'.format(weibo_id, weibo_id)
            else:
                if str(max_id) == '0':
                    # max_id == 0 means there is no next page, stop the loop
                    print('max_id==0,break now')
                    break
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type=0'.format(weibo_id, weibo_id, max_id)
            response = requests.get(url, headers=headers)
            # ok = response.json()['ok']
            # print(ok)
            print(response.status_code)
            max_id = response.json()['data']['max_id']
            # print(response.json()['data']['max_id'])
            print(max_id)
            datas = response.json()['data']['data']
            page_list = []
            id_list = []
            text_list = []
            time_list = []
            like_count_list = []
            source_list = []
            username_list = []
            user_id_list = []
            user_gender_list = []
            follow_count_list = []
            followers_count_list = []
            for data in datas:
                page_list.append(page)
                id_list.append(data['id'])
                # clean the comment text with a regular expression
                dr = re.compile(r'<[^>]+>', re.S)
                text2 = dr.sub('', data['text'])
                text_list.append(text2)                            # comment text
                time_list.append(trans_time(data['created_at']))   # comment time
                like_count_list.append(data['like_count'])         # likes
                source_list.append(data['source'])                 # IP region
                username_list.append(data['user']['screen_name'])  # commenter name
                user_id_list.append(data['user']['id'])
                user_gender_list.append(data['user']['gender'])    # commenter gender
                follow_count_list.append(data['user']['follow_count'])   # how many accounts the commenter follows
                followers_count = str(data['user']['followers_count'])
                if followers_count[-1] == '万':
                    # convert "x万" (x times 10,000) into a plain number
                    followers_count = int(float(followers_count.strip('万')) * 10000)
                followers_count_list.append(followers_count)       # commenter follower count
            # assemble the lists into a DataFrame
            df = pd.DataFrame({
                '评论页码': page_list,
                '微博id': [weibo_id] * len(time_list),
                '评论id': id_list,
                '评论内容': text_list,
                '评论时间': time_list,
                '评论点赞数': like_count_list,
                '评论属地': source_list,
                '评论者姓名': username_list,
                '评论者id': user_id_list,
                '评论者性别': user_gender_list,
                '评论者关注数': follow_count_list,
                '评论者粉丝数': followers_count_list,
            })
            # only write the header row the first time
            if os.path.exists(wbComment_file):
                header = None
            else:
                header = ['评论页码', '微博id', '评论id', '评论内容', '评论时间', '评论点赞数', '评论属地', '评论者姓名', '评论者id', '评论者性别', '评论者关注数', '评论者粉丝数']
            column = ['评论页码', '微博id', '评论id', '评论内容', '评论时间', '评论点赞数', '评论属地', '评论者姓名', '评论者id', '评论者性别', '评论者关注数', '评论者粉丝数']
            # append to the csv file
            df.to_csv(wbComment_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
            # print('csv保存成功:{}'.format(bili_file))
            # print(df)
            print('第{}页爬取完成'.format(page))
        # data cleaning: remove duplicates
        df = pd.read_csv(wbComment_file, engine='python', encoding='utf-8-sig')
        os.remove(wbComment_file)
        df.drop_duplicates(subset='评论内容', inplace=True, keep='first')
        # save the cleaned csv again
        column = header = ['评论页码', '微博id', '评论id', '评论内容', '评论时间', '评论点赞数', '评论属地', '评论者姓名', '评论者id', '评论者性别', '评论者关注数', '评论者粉丝数']
        df.to_csv(wbComment_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
        print('数据清洗完成')
        weibo_comment_pic.main(wbComment_file)


if __name__ == '__main__':
    # target post: /detail/4903111417922777 (the host was stripped in the original)
    # target Weibo post IDs; several can be crawled in one run (here only one)
    weiboID_list = [str(x) for x in input("请输入微博ID(示例:4903111417922777),以逗号分隔:").split(',')]
    # weiboID_list = ['4903111417922777']
    # maximum number of pages to crawl
    max_page = int(input("请输入搜索的页数"))
    # start crawling
    get_bili_comment(weiboID_list=weiboID_list, max_page=max_page)
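The comment endpoint is paged with a cursor rather than a page number: the first request carries no max_id, every response returns data.max_id for the next request, and a max_id of 0 means the last page has been reached. The core of that loop, reduced to a sketch (fetch_page is a hypothetical stand-in for the hotflow request above):

def crawl_all(fetch_page):
    """fetch_page(max_id) -> (comments, next_max_id); hypothetical helper wrapping the request."""
    max_id = ''
    while True:
        comments, max_id = fetch_page(max_id)   # the first call passes an empty cursor
        yield from comments
        if str(max_id) == '0':                  # 0 signals that there is no further page
            break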

9. weibo_comment_pic.py

# allow a duplicate OpenMP runtime, ignore the error
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


def view(info, weibo_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    likes = info['评论点赞数']      # likes
    reply = info['评论者粉丝数']    # follower counts
    forward = info['评论者关注数']  # following counts
    author = info['评论者姓名']     # commenter names (the comment text itself is too long for tick labels)
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # ***** bar chart of like counts *****
    fig, ax1 = plt.subplots()
    length = len(author)
    plt.bar(x=np.arange(length), tick_label=author, height=likes, color='blue')
    plt.title('评论点赞数、粉丝数和关注数的数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('微博内容')      # x-axis label
    plt.ylabel('评论点赞数')    # y-axis label
    plt.xticks(rotation=90, color='green')   # rotation and colour of the x tick labels
    # ***** line chart of follower counts *****
    ax2 = ax1.twinx()   # required for the combined chart
    ax2.plot(reply, color='red')
    # ***** line chart of following counts *****
    ax2.plot(forward, color='yellow')
    plt.ylabel('粉丝/关注数')
    plt.plot(1, label='评论者点赞数', color="blue", linewidth=5.0)   # dummy plot so the legend has an entry
    # plt.plot(1, label='评论者粉丝数', color="red", linewidth=1.0, linestyle="-")
    # plt.plot(1, label='评论者关注数', color="yellow", linewidth=1.0, linestyle="-")
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(weibo_file):
    info = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
    info = info.nlargest(100, '评论点赞数')
    info = info.reset_index(drop=True)
    view(info, weibo_file)


if __name__ == '__main__':
    main('weiboComment_15pages_4903111417922777.csv')

10. weibo_pic.py

# allow a duplicate OpenMP runtime, ignore the error
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


def view(info, weibo_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    likes = info['点赞数']      # likes
    reply = info['评论数']      # comments
    forward = info['转发数']    # reposts
    author = info['微博作者']   # authors (the post text itself is too long for tick labels)
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # ***** bar chart of like counts *****
    fig, ax1 = plt.subplots()
    length = len(author)
    plt.bar(x=np.arange(length), tick_label=author, height=likes, color='blue')
    plt.title('点赞数、评论数和转发数的数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('微博内容')    # x-axis label
    plt.ylabel('点赞数')      # y-axis label
    plt.xticks(rotation=90, color='green')   # rotation and colour of the x tick labels
    # ***** line chart of comment counts *****
    ax2 = ax1.twinx()   # required for the combined chart
    ax2.plot(reply, color='red')
    # ***** line chart of repost counts *****
    ax2.plot(forward, color='yellow')
    plt.ylabel('评论/转发数')
    plt.plot(1, label='点赞数', color="blue", linewidth=5.0)   # dummy plot so the legend has an entry
    # plt.plot(1, label='评论数', color="red", linewidth=1.0, linestyle="-")
    # plt.plot(1, label='转发数', color="yellow", linewidth=1.0, linestyle="-")
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(weibo_file):
    info = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
    info = info.nlargest(100, '点赞数')
    info = info.reset_index(drop=True)
    view(info, weibo_file)


if __name__ == '__main__':
    main('微博清单_疫情_前10页.csv')

11. weibo_top.py

import os.path
import re
from jsonpath import jsonpath
import requests
import pandas as pd
from fake_useragent import UserAgent
import weibo_top_pic
import weibo_top_pie
import draw_cloud


def get_weibo_top():
    # the four hot-search boards: realtime, fitness, games, fun
    keyword = ['realtimehot', 'gym', 'game', 'fun']
    for search_keyword in keyword:
        # output file name
        v_weibo_file = '微博top_{}.csv'.format(search_keyword)
        # if the csv already exists, delete it first
        if os.path.exists(v_weibo_file):
            os.remove(v_weibo_file)
            print('微博榜单存在,已删除:{}'.format(v_weibo_file))
        print('===开始爬取{}微博榜单==='.format(search_keyword))
        # request headers
        ua = UserAgent()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br"
        }
        # request URL (the host was stripped from the original source; assumed to be m.weibo.cn)
        url = 'https://m.weibo.cn/api/container/getIndex'
        # request parameters
        params = {
            "containerid": "106003type=25&t=3&disable_hot=1&filter_type={}".format(search_keyword),
            "title": "微博热搜",
            "show_cache_when_error": 1,
            "extparam": "seat=1&dgr=0&filter_type=realtimehot&region_relas_conf=0&pos=0_0&c_type=30&lcate=1001&mi_cid=100103&cate=10103&display_time=1684642048&pre_seqid=144917672",
            "luicode": 10000011,
            "lfid": 231583,
        }
        # send the request
        r = requests.get(url, headers=headers, params=params)
        print(r.status_code)
        # parse the JSON data
        cards = r.json()["data"]["cards"][0]["card_group"]
        # hot-search text
        text_list = jsonpath(cards, '$..desc')
        print('text_list is:')
        print(text_list)
        # hot-search link
        href_list = jsonpath(cards, '$..scheme')
        # hot-search rank (derived from the marker icon)
        order_list = jsonpath(cards, '$..pic')
        # hot-search heat
        view_count_list = jsonpath(cards, '$..desc_extr')
        j = 1
        # note: the icon URLs below lost part of their host name in the original source
        for i in range(0, len(order_list)):
            # the orange dot icon marks promoted entries: no rank, heat 0
            if order_list[i] == 'https://simg./0408_search_point_orange.png':
                order_list[i] = '无'
                view_count_list[i] = 0
                continue
            # the "stick" icon marks the pinned entry: no rank, heat 0
            if order_list[i] == "https://simg./0205110043_img_search_stick%403x.png":
                view_count_list.insert(0, 0)
                order_list[i] = '无'
                continue
            # keep only the digits of the heat value
            view_count_list[i] = str(view_count_list[i])
            view_count_list[i] = int(re.sub(r"\D", "", view_count_list[i]))
            order_list[i] = j
            j = j + 1
        print(len(order_list), len(text_list), len(view_count_list), len(href_list))
        df = pd.DataFrame({
            '热搜排名': order_list,
            '热搜内容': text_list,
            '热搜热度': view_count_list,
            '热搜连接地址': href_list,
        })
        # only write the header row the first time
        if os.path.exists(v_weibo_file):
            header = None
        else:
            header = ['热搜排名', '热搜内容', '热搜热度', '热搜连接地址']
        column = ['热搜排名', '热搜内容', '热搜热度', '热搜连接地址']
        # append to the csv file
        df.to_csv(v_weibo_file, mode='a+', index=False, columns=column, header=header, encoding='utf-8-sig')
        print('csv保存成功:{}'.format(v_weibo_file))
        weibo_top_pic.main(v_weibo_file)
        weibo_top_pie.pie(v_weibo_file)
        # draw_cloud.draw_cloud(v_weibo_file)


if __name__ == '__main__':
    # crawl the Weibo hot-search boards
    get_weibo_top()
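The heat value (desc_extr) comes back as free text, sometimes with a category label in front of the number, so the script keeps only the digits before converting to int; pinned and promoted entries are recognised by their marker icons and get rank '无' with heat 0. The cleaning step in isolation, on a made-up value:

import re

raw = '剧烈运动 2905674'                # hypothetical desc_extr value
heat = int(re.sub(r"\D", "", raw))      # drop every non-digit character
print(heat)                             # 2905674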

12. weibo_top_pic.py

# allow a duplicate OpenMP runtime, ignore the error
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
import numpy as np

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


def view(info, weibo_file):
    # Chinese font so that Chinese text renders in the chart
    my_font = font_manager.FontProperties(fname='./STHeiti-TC-Medium.ttf')
    heat = info['热搜热度']
    content = info['热搜内容']
    # let the axes display Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # ***** bar chart of hot-search heat *****
    fig, ax1 = plt.subplots()
    length = len(content)
    plt.bar(x=np.arange(length), tick_label=content, height=heat, color='blue')
    plt.title('热搜内容和热搜热度的数据分析', fontproperties=my_font)   # chart title
    ax1.tick_params(labelsize=6)
    plt.xlabel('热搜内容')    # x-axis label
    plt.ylabel('热搜热度')    # y-axis label
    plt.xticks(rotation=90, color='green')   # rotation and colour of the x tick labels
    plt.plot(1, label='热搜热度', color="blue", linewidth=5.0)   # dummy plot so the legend has an entry
    plt.legend()
    plt.savefig(r'.\图片\pic-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


def main(weibo_file):
    # entry point called from weibo_top.py; the original listing breaks off before this
    # function, so this is a minimal reconstruction following the other *_pic scripts
    info = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
    view(info, weibo_file)

13. weibo_top_pie.py

import pandas as pd
import numpy as np
from pyecharts import options as opts        # imported in the original but not used
from pyecharts.charts import Pie              # imported in the original but not used
import matplotlib.pyplot as plt


def pie(weibo_file):
    plt.rcParams['font.family'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    data = pd.read_csv(weibo_file, engine='python', encoding='utf-8-sig')
    df1 = data['热搜内容']
    df2 = data['热搜热度']
    X = df1
    Y = []
    s = sum(df2)
    # share of the total heat for each entry
    for i in df2:
        a = i / s
        a = round(a, 2)
        Y.append(a)
    plt.figure(figsize=(12, 12))
    # a wedge width below 1 turns the pie into a donut chart
    plt.pie(x=Y, labels=X, wedgeprops={'width': 0.4}, startangle=90, autopct='%.2f%%', pctdistance=0.9)
    plt.title('热搜对应的热度占比', fontsize=20)
    plt.savefig(r'.\图片\pie-{}.png'.format(weibo_file), dpi=1000, bbox_inches='tight')   # save locally
    plt.show()


if __name__ == '__main__':
    pie('微博top_realtimehot.csv')

14. pachong.py

import weibo
import weibo_top
import weibo_comment
import bilibili
import bili_comment

net = int(input("请选择爬取的网站:1.微博 2.b站 3.停止爬取"))
while (net != 3):
    if (net == 1):
        choice1 = int(input("请选择爬取的方向:1.排行榜 2.关键词 3.评论"))
        if (choice1 == 1):
            # crawl the Weibo hot-search boards
            weibo_top.get_weibo_top()
        if (choice1 == 2):
            # search keyword
            search_keyword = input("请输入搜索的关键词")
            # number of pages to crawl
            max_search_page = int(input("请输入搜索的页数"))
            # crawl the Weibo keyword search
            weibo.get_weibo_list(v_keyword=search_keyword, v_max_page=max_search_page)
        if (choice1 == 3):
            # target Weibo post IDs; several can be crawled in one run (here only one)
            weiboID_list = [str(x) for x in input("请输入微博ID(示例:4903111417922777),以逗号分隔:").split(',')]
            # maximum number of pages to crawl
            max_page = int(input("请输入搜索的页数"))
            # crawl the Weibo comments
            weibo_comment.get_bili_comment(weiboID_list=weiboID_list, max_page=max_page)
    if (net == 2):
        choice2 = int(input("请选择爬取的方向:1.排行榜 2.关键词 3.评论"))
        if (choice2 == 1):
            # create the ranking object
            b = bilibili.bilihot()
            # fetch the ranking list
            b.findall()
            # save the data
            b.storeCsvdata()
        if (choice2 == 2):
            # search: the keyword to search for
            search = input("请输入搜索的关键词")
            # page: how many result pages to fetch
            page = int(input("请输入搜索的页数"))
            # create the search object
            a = bilibili.bilisearch(search, page)
            # crawl the data and save it
            a.findall()
        if (choice2 == 3):
            # BV numbers of the videos; several videos can be crawled in one run
            bv_list = [str(x) for x in input("请输入视频bv号(示例:BV1Ss4y1M7KT,BV1VM411N7qc),以逗号分隔:").split(',')]
            # maximum number of pages to crawl
            max_page = int(input("请输入搜索的页数"))
            # crawl the Bilibili comments
            bili_comment.get_bili_comment(bv_list=bv_list, max_page=max_page)
    net = int(input("请选择爬取的网站:1.微博 2.b站 3.停止爬取"))
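pachong.py is only an interactive menu; every option dispatches to the functions shown in the earlier sections, so the modules can also be driven without the menu. A short non-interactive sketch using the same functions (the output files are the ones described below):

import weibo_top
import bilibili

# Weibo hot-search boards, written to the 微博top_*.csv files
weibo_top.get_weibo_top()

# Bilibili ranking list, written to b站排行榜.csv
hot = bilibili.bilihot()
hot.findall()
hot.storeCsvdata()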

15. Overview of the code files

pachong: main program that drives the Bilibili and Weibo crawlers and their visualizations

Bilibili:

bilibili: scrapes the Bilibili ranking list and keyword search results

bili_comment: scrapes Bilibili video comments

bilihot_pic: visualizes the ranking-list data (bar chart, line chart)

bilisearch_pic: visualizes the keyword-search data (bar chart, line chart)

bili_comment_pic: visualizes the comment data (bar chart, line chart)

Weibo:

weibo_top: scrapes the Weibo hot-search boards

weibo: scrapes Weibo keyword search results

weibo_comment: scrapes Weibo comments

weibo_top_pic: visualizes the hot-search data (bar chart, line chart)

weibo_top_pie: visualizes the hot-search data (donut chart)

weibo_pic: visualizes the keyword-search data (bar chart, line chart)

weibo_comment_pic: visualizes the comment data (bar chart, line chart)

draw_cloud: visualizes the hot-search data (word cloud)
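The scripts do not ship a dependency list. Judging from the imports, the third-party packages needed are roughly requests, pandas, numpy, matplotlib, wordcloud, Pillow, jsonpath, fake_useragent and pyecharts, plus the local font and background-image files referenced in the code; exact package versions are not stated in the original.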
