300字范文,内容丰富有趣,生活中的好帮手!
300字范文 > 爬虫小实战(selenium) 数据小分析(pywebio pyecharts)python分析写在网页 爬取

爬虫小实战(selenium) 数据小分析(pywebio pyecharts)python分析写在网页 爬取

时间:2022-02-03 03:57:50

相关推荐

爬虫小实战(selenium) 数据小分析(pywebio pyecharts)python分析写在网页 爬取

爬取数据

通过selenium爬取世界500强企业数据

import csv
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Target URL and the headers sent with the plain-HTTP reachability probe.
url = '版权问题,请查看项目地址'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/97.0.4692.71 '
                  'Safari/537.36 Edg/97.0.1072.55 '
}

# XPath templates for the cells of one table row, in output column order:
# rank, company, income, profit, nation. `{}` is the 1-based row index.
_COLUMN_XPATHS = (
    '//*[@id="table1"]/tbody/tr[{}]/td[1]',
    '//*[@id="table1"]/tbody/tr[{}]/td[2]/a',
    '//*[@id="table1"]/tbody/tr[{}]/td[3]',
    '//*[@id="table1"]/tbody/tr[{}]/td[4]',
    '//*[@id="table1"]/tbody/tr[{}]/td[5]',
)


def crawler(pages=10, rows_per_page=50, output_path=r'Fortune500.csv'):
    """Scrape the paginated ranking table with Selenium and append it to a CSV.

    The URL is first probed with ``requests``; if the response status is not
    200 the function prints ``access failed`` and returns without starting a
    browser. Otherwise every row of every page is read cell by cell, printed,
    and appended to ``output_path``.

    Args:
        pages: Number of table pages to walk through.
        rows_per_page: Number of rows read from each page.
        output_path: CSV file that rows are appended to.
    """
    # A timeout keeps the probe from hanging forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print('access failed')
        return

    # NOTE(review): recent Selenium releases expect a Service object instead of
    # a positional driver path -- confirm against the installed version.
    chrome = webdriver.Chrome(r'chromedriver.exe')
    try:
        chrome.get(url)
        # Hide the `navigator.webdriver` flag from scripts running in the page.
        script = 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined,});'
        chrome.execute_script(script)
        time.sleep(2)

        # Open the file once for the whole crawl; newline='' lets the csv
        # module control line endings (avoids blank rows on Windows).
        with open(output_path, 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for page in range(1, pages + 1):
                for j in range(1, rows_per_page + 1):
                    row = [
                        chrome.find_element(By.XPATH, xpath.format(j)).text
                        for xpath in _COLUMN_XPATHS
                    ]
                    writer.writerow(row)
                    print(row)

                # Only advance while there is another page left to read.
                if page < pages:
                    next_page = chrome.find_element(By.XPATH, '//*[@id="table1_next"]')
                    next_page.click()
    finally:
        # Always shut the browser down, even if a lookup fails mid-crawl.
        chrome.quit()


if __name__ == '__main__':
    crawler()

写入csv后

使用openrefine进行清洗

清洗后得到

作图小分析

import csv
from collections import Counter

from pyecharts import options as opts
from pyecharts.charts import Bar, Pie
from pywebio.output import put_html

# Cleaned data set; columns are read by index: 1 = company, 2 = income,
# 3 = profit, 4 = nation.
_CSV_PATH = r'Fortune500After.csv'


def _read_rows():
    """Return all data records of the cleaned CSV, with the header skipped."""
    with open(_CSV_PATH, encoding='utf-8', newline='') as jd:
        reader = csv.reader(jd)
        next(reader, None)  # skip the header record
        # Blank lines come back as empty lists; they carry no data.
        return [row for row in reader if row]


def _count_by_nation():
    """Count companies per nation, keyed in first-seen order."""
    return Counter(row[4] for row in _read_rows())


def _profitable_companies():
    """Return ``(company, income, profit)`` for every row with profit > 0.

    All three fields are parsed before anything is kept, so a row with a
    malformed number is dropped as a whole and the resulting series can never
    get out of step with their labels.
    """
    records = []
    for row in _read_rows():
        try:
            name, income, profit = row[1], float(row[2]), float(row[3])
        except (IndexError, ValueError):
            # Short row or non-numeric income/profit: skip the whole record.
            continue
        if profit > 0:
            records.append((name, income, profit))
    return records


def handle():
    """Render all four charts into the PyWebIO page, in display order."""
    put_html(proportionBar())
    put_html(proportionPie())
    put_html(Proportion())
    put_html(incomeProfit())


def proportionBar():
    """Build a bar chart of the number of listed companies per nation."""
    counts = _count_by_nation()

    bar = Bar()
    bar.add_xaxis(list(counts))
    bar.add_yaxis("世界500强数量", list(counts.values()))
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="各个国家拥有世界500强企业"),
        xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 45}),
    )
    return bar.render_notebook()


def proportionPie():
    """Build a pie chart of the number of listed companies per nation."""
    counts = _count_by_nation()

    pie = Pie()
    pie.add(
        '数量',
        [list(pair) for pair in counts.items()],
        radius='45%',
        center=["50%", "65%"],
    )
    return pie.render_notebook()


def incomeProfit():
    """Build a horizontal bar chart of income and profit per profitable company."""
    records = _profitable_companies()

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis([name for name, _, _ in records])
    bar.add_yaxis("营业收入", [income for _, income, _ in records])
    bar.add_yaxis("利润", [profit for _, _, profit in records])
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="营业收入与利润(不包括利润小于0)"))
    return bar.render_notebook()


def Proportion():
    """Build a horizontal bar chart of profit as a percentage of income."""
    # Rows with zero income have no defined margin and are left out entirely,
    # so labels and values stay aligned.
    margins = [
        (name, profit / income * 100)
        for name, income, profit in _profitable_companies()
        if income != 0
    ]

    bar = Bar(init_opts=opts.InitOpts(width='4000px', height='30000px'))
    bar.add_xaxis([name for name, _ in margins])
    bar.add_yaxis("利润率", [margin for _, margin in margins])
    bar.reversal_axis()
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.set_global_opts(title_opts=opts.TitleOpts(title="利润率分析(不包括利润小于0)"))
    return bar.render_notebook()


if __name__ == '__main__':
    handle()

运行后在随机端口打开的html页面中查看结果

不清楚的地方联系邮箱:wes0018@

不要白嫖哦,点个赞,评个论,感谢

免费提供源码和csv下载地址

GitHub - weiensong/Fortune500: 爬取分析世界五百强

爬虫小实战(selenium) 数据小分析(pywebio pyecharts)python分析写在网页 爬取世界500强企业

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。