300字范文 > Python爬取百度搜索风云榜实时热点.

Python爬取百度搜索风云榜实时热点.

时间：2022-08-04 16:09:58

Python爬虫实战源码合集（持续更新）

百度搜索风云榜：http://top.baidu.com/

源码：

"""Record Baidu's real-time hot-search ranking into a per-day JSON file.

Each run fetches the ranking page, extracts (keyword, search index) pairs and
appends them, stamped with the current UTC+8 time, to
``<year>/<yyyymmdd> 实时热点.json`` relative to the working directory.
"""
import os
import json
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup

# The listing this script was recovered from requested '/buzz?b=...' with no
# scheme or host, which requests rejects before sending anything. The host
# below is a reconstruction.
# NOTE(review): confirm this endpoint still exists and still serves the
# 'list-table' markup parsed in crawl_baidu_top().
BAIDU_TOP_URL = 'http://top.baidu.com/buzz'

# Seconds to wait for the server before giving up; requests has no default
# timeout and would otherwise block indefinitely.
DEFAULT_TIMEOUT = 10


def get_utc8now():
    """Return the current time as a timezone-aware datetime in UTC+8."""
    return datetime.now(timezone(timedelta(hours=8)))


def save_as_json(filename, records):
    """Append *records* to the JSON history stored at *filename*.

    The file holds one object mapping each keyword to a list of
    ``{'time': ..., 'count': ...}`` entries. Existing content is loaded
    first (key order preserved), one new entry per record is appended under
    its keyword, and the whole object is written back.

    Args:
        filename: Path of the JSON file; created if it does not exist.
        records: Iterable of ``(keyword, search_index)`` pairs.
    """
    history = OrderedDict()
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            history = json.load(f, object_pairs_hook=OrderedDict)

    # One timestamp for the whole batch so entries from a single run match.
    time_str = str(get_utc8now())
    for keyword, search_index in records:
        history.setdefault(keyword, []).append(
            {'time': time_str, 'count': search_index})

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII keywords readable in the file.
        json.dump(history, f, indent=4, separators=(',', ': '),
                  ensure_ascii=False, sort_keys=False)


def crawl_baidu_top(buzz_no=1, timeout=DEFAULT_TIMEOUT):
    """Fetch one ranking board and return its rows.

    Args:
        buzz_no: Value sent as the ``b`` query parameter to select the board.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of ``(keyword, search_index)`` string tuples in page order.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status.
        ValueError: The response contains no ``<table class="list-table">``.
    """
    response = requests.get(BAIDU_TOP_URL, params={'b': buzz_no},
                            timeout=timeout)
    # Fail here rather than trying to parse an error page as the ranking.
    response.raise_for_status()
    # Override whatever charset requests guessed before reading .text.
    response.encoding = 'gb18030'

    soup = BeautifulSoup(response.text, 'html.parser')
    table_tag = soup.find('table', {'class': 'list-table'})
    if table_tag is None:
        raise ValueError(
            'ranking table (table.list-table) not found in response '
            'from {}'.format(response.url))

    records = []
    for row in table_tag.find_all('tr'):
        keyword_tag = row.find('td', {'class': 'keyword'})
        last_tag = row.find('td', {'class': 'last'})
        # Rows lacking either cell are not ranking entries; skip them.
        if keyword_tag is None or last_tag is None:
            continue
        title_tag = keyword_tag.find('a', {'class': 'list-title'})
        if title_tag is None:
            # Keyword cell without a title link: nothing to record.
            continue
        records.append((title_tag.text.strip(), last_tag.text.strip()))
    return records


def main():
    """Crawl the default board and append it to today's JSON file."""
    now = get_utc8now()
    year_str = now.strftime('%Y')
    date_str = now.strftime('%Y%m%d')
    os.makedirs(year_str, exist_ok=True)
    filename = os.path.join(year_str, '{} 实时热点.json'.format(date_str))
    records = crawl_baidu_top()
    save_as_json(filename, records)


if __name__ == '__main__':
    main()

运行：

再次运行：

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。