
A Crawler for the "Sogou Wenwen" (搜狗问问) Q&A Corpus

Posted: 2021-12-08 23:55:51


My graduation project is to build a question answering system based on machine learning. It needs a large number of question-answer pairs, and every question should carry a category tag.

Since very few publicly released Q&A corpora come with category tags, I wrote my own crawler to collect the data.

Chinese Q&A sites include Baidu Zhidao (百度知道), Zhihu (知乎), Wukong Wenda (悟空问答), Qihu Wenda (奇虎问答), and Sogou Wenwen (搜狗问问). After comparing them, I settled on Sogou Wenwen, for three reasons:

1. It has no anti-crawling mechanism; it does not even enforce a basic limit on request frequency.
2. Every question carries one broad category tag and several fine-grained tags.
3. The URL structure is clear and regular (an example is shown right below).
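For instance, the question-list page for one category tag and one page number is addressed purely by query parameters, assembled exactly as in the main loop of the script further down. The host name was evidently stripped when this post was published, so only the path is shown here:

/cate/tag?tag_id=101&tp=0&pno=1&ch=ww.fly.fy2#questionList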

The crawler uses a tree-based, level-order (breadth-first) traversal:
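In outline, the traversal goes from the category tags (the root level), to the paginated question-list pages under each tag (the next level), and finally to the individual question pages (the leaves). Below is a minimal sketch of that level-order walk; fetch_list_page and fetch_question are hypothetical placeholders standing in for the urllib2/BeautifulSoup calls in the full script:

# Level-order (BFS) walk over the tag -> list-page -> question tree.
# fetch_list_page() and fetch_question() are hypothetical placeholders.
from collections import deque

def crawl(tag_ids, pages_per_tag):
    queue = deque(('tag', tag_id) for tag_id in tag_ids)   # level 0: category tags
    results = []
    while queue:
        kind, node = queue.popleft()
        if kind == 'tag':
            # level 1: every paginated list page under this tag
            for page_no in range(1, pages_per_tag + 1):
                queue.append(('list', (node, page_no)))
        elif kind == 'list':
            # level 2: every question linked from this list page
            for question_url in fetch_list_page(*node):
                queue.append(('question', question_url))
        else:
            # leaf: one question page with its answers
            results.append(fetch_question(node))
    return results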

It is written in Python (the original script targets Python 2); the full code is as follows:

# coding:utf-8
import urllib2
import re
from bs4 import BeautifulSoup
import codecs
import sys
import json

# Python 2 trick: make UTF-8 the default encoding while keeping the std streams.
stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
sys.setdefaultencoding('utf-8')

'''
Crawl the question-answer sets under every category tag on Sogou Wenwen.
Each question is appended as one JSON record, e.g.:
{"answer": ["我一直用的是云末感觉还是挺稳定的。"],
 "tag": {"75023": "英雄联盟"},
 "question": "网易uu加速器加速lol怎么样",
 "hasAnswer": true}
'''

global rootUrl


# Load one question-list page and return the Q&A pairs found on it.
def LoadPage(url):
    try:
        user_agent = "Mozilla/5.0(WindowsNT6.1;WOW64;rv:6.0)Gecko/0101Firefox/6.0"
        headers = {"User-Agent": user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read()
        allTitles = GetTitle(html)
        if allTitles:
            QuestionAnswers = GetQuestionAnswers(allTitles)
            if QuestionAnswers:
                return QuestionAnswers
    except Exception, e:
        print str(e)


# Parse the question titles, tags and detail-page URLs out of a list page.
def GetTitle(html):
    allTitles = []
    myAttrs = {'class': 'sort-lst-tab'}
    bs = BeautifulSoup(html)
    titles = bs.find_all(name='a', attrs=myAttrs)
    for titleInfo in titles:
        item = {}
        titleInfoStr = str(titleInfo)
        questionInfo = re.findall(r'sort-tit">(.*?)</p>', titleInfoStr, re.S)
        question = questionInfo[0]
        answerInfo = re.findall(r'sort-rgt-txt">(.*?)</span>', titleInfoStr, re.S)
        # u'0个回答' is the "0 answers" marker on the list page.
        if u'0个回答' in answerInfo:
            item['hasAnswer'] = False
        else:
            item['hasAnswer'] = True
        tags = re.findall(r'sort-tag" data-id=(.*?)/span>', titleInfoStr, re.S)
        tagInfo = {}
        for tag in tags:
            tagId = re.findall(r'"(.*?)">', tag, re.S)
            tagName = re.findall(r'>(.*?)<', tag, re.S)
            tagInfo[tagId[0]] = tagName[0]
            # Record every fine-grained tag we encounter.
            if tagId[0] not in smalltags:
                smalltags[tagId[0]] = tagName[0]
        subUrl = re.findall(r'href="(.*?)"', titleInfoStr, re.S)
        url = rootUrl + subUrl[0]
        item['url'] = url
        item['question'] = question
        item['tag'] = tagInfo
        allTitles.append(item)
    return allTitles


# Fetch each question's detail page and collect its answers.
def GetQuestionAnswers(allTitles):
    QuestionAnswers = []
    for item in allTitles:
        QuestionAnswer = {}
        if item['hasAnswer']:
            Answers = []
            url = item['url']
            try:
                user_agent = "Mozilla/5.0(WindowsNT6.1;WOW64;rv:6.0)Gecko/0101Firefox/6.0"
                headers = {"User-Agent": user_agent}
                request = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(request)
                html = response.read()
                questionAttrs = {'id': 'question_title_val'}
                answerAttrs = {'class': 'replay-info-txt answer_con'}
                bs = BeautifulSoup(html)
                # questions = bs.find_all(name='span', attrs=questionAttrs)
                questions = re.findall(r'question_title_val">(.*?)</span>', html, re.S)
                question = questions[0]
                answers = bs.find_all(name='pre', attrs=answerAttrs)
                if answers:
                    for answer in answers:
                        answerStr = ''
                        if "<p>" in str(answer):
                            segements = re.findall(r'<p>(.*?)</p>', str(answer), re.S)
                            for seg in segements:
                                answerStr = answerStr + str(seg)
                            if answerStr.strip() != "":
                                Answers.append(answerStr.strip())
                        else:
                            noPanswer = re.findall(r'answer_con">(.*?)</pre>', str(answer), re.S)
                            Answers.append(noPanswer[0])
                QuestionAnswer['answer'] = Answers
                QuestionAnswer['question'] = question
                QuestionAnswer['tag'] = item['tag']
                QuestionAnswer['hasAnswer'] = True
            except Exception, e:
                print str(e)
        else:
            QuestionAnswer['question'] = item['question']
            QuestionAnswer['tag'] = item['tag']
            QuestionAnswer['answer'] = ''
            QuestionAnswer['hasAnswer'] = False
        QuestionAnswers.append(QuestionAnswer)
    return QuestionAnswers


# if __name__ == '__main__':
# The host name appears to have been stripped when this post was published;
# prepend the Sogou Wenwen domain to baseurl and rootUrl before running.
baseurl = "/cate/tag?"
rootUrl = ''
# Category-tag IDs to crawl.
tagids = ['101', '146', '111', '163614', '50000010', '121', '93474', '9996',
          '148', '50000032', '135', '125', '9990', '465873']
global smalltags
smalltags = {}
# Walk the category tags one by one.
for tagid in tagids:
    f = codecs.open('../../../origin_data/wenwen_corpus/QuestionAnswers/' + str(tagid) + '/test.json', 'a', encoding='utf-8')
    t = codecs.open('../../../origin_data/wenwen_corpus/QuestionAnswers/' + str(tagid) + '/smalltag.json', 'a', encoding='utf-8')
    # Pull n list pages for each tag.
    print u'标签:', tagid
    for i in range(5000, 0, -1):
        tag = 'tag_id=' + tagid
        tp = '&tp=0'
        pno = '&pno=' + str(i)
        ch = '&ch=ww.fly.fy' + str(i + 1) + '#questionList'
        url = baseurl + tag + tp + pno + ch
        print url
        QuestionAnswers = LoadPage(url)
        if QuestionAnswers:
            for qa in QuestionAnswers:
                jsonStr = json.dumps(qa, ensure_ascii=False)
                f.write(jsonStr.encode("utf-8") + '\n')
    # Save the fine-grained tag table collected for this category.
    json.dump(smalltags, t, ensure_ascii=False)
    t.close()
    f.close()
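The script above is Python 2 only (urllib2, reload(sys), print statements, the old except syntax). If you want to rerun the fetch step on Python 3, the request maps to urllib.request; the sketch below shows only that step and is an assumption, not part of the original script, and the parsing functions would need the corresponding str/bytes and syntax updates as well:

# Minimal Python 3 equivalent of the page-fetch step in LoadPage() (sketch only).
import urllib.request

def load_html(url):
    headers = {"User-Agent": "Mozilla/5.0(WindowsNT6.1;WOW64;rv:6.0)Gecko/0101Firefox/6.0"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8', errors='ignore')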

The crawled data is stored as one JSON record per line in each tag's test.json, with question, answer, tag, and hasAnswer fields, as in the example shown in the script's docstring above.
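A small sketch of how a downstream consumer (such as the QA system) might read one of these files; the file path and Python 3 usage here are assumptions for illustration:

import json

# Read one tag's crawl output: one JSON record per line (JSON Lines).
def load_records(path):
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records

# Example: keep only the questions that actually received answers.
answered = [r for r in load_records('test.json') if r['hasAnswer']]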
