# -*- coding:utf-8 -*-
import json
import os
import random
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Silence urllib3's InsecureRequestWarning process-wide; requests in this file are sent with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class jd(object):
    """Scrape the product list of a single JD shop and save it as a CSV file.

    A ``requests`` session carrying mobile-browser headers is created once in
    ``__init__`` and reused for every request issued by :meth:`getpid`.
    """

    # Compiled once; applied to the raw response text of every result page.
    _SHOP_ID_RE = re.compile(r'index-(.+?)\.html')
    _WARE_ID_RE = re.compile(r'"wareId":(.*?),')
    _WNAME_RE = re.compile(r'"wname":"(.*?)",')
    _JD_PRICE_RE = re.compile(r'"jdPrice":"(.*?)",')

    # Highest page number requested; the loop normally stops earlier, at the
    # first page whose response contains no product IDs.
    _MAX_PAGES = 9999

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the scraper forever.
    _TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.s = requests.session()
        self.s.headers.update({
            'Accept': '*/*',
            'Accept-Language': 'zh-CN',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
        })

    def getpid(self, url, name, path):
        """Collect every product of the shop at *url* and write ``jdmall.csv``.

        Result pages are requested one after another until a page contains no
        product IDs. The CSV has the columns ``name``, ``wareId``, ``wname``
        and ``jdPrice`` and is encoded as GB18030. When the shop has no
        products the file contains only the header row.

        Args:
            url: Shop address containing ``index-<shopId>.html``.
            name: Free-form label copied into the ``name`` column of every row.
            path: Directory in which ``jdmall.csv`` is written.

        Raises:
            ValueError: If no shop ID can be extracted from *url*, or if a
                result page yields different numbers of IDs, names and prices
                (the columns could not be aligned row by row).
        """
        shop_match = self._SHOP_ID_RE.search(url)
        if shop_match is None:
            raise ValueError(
                'cannot find a shop ID ("index-<id>.html") in url %r' % (url,))
        self.shopid = shop_match.group(1)

        # The endpoint takes a 13-digit (millisecond) timestamp as query value.
        stamp = int(time.time() * 1000)
        searchurl = 'https://shop./search/searchWareAjax.json?r=' + str(stamp)
        self.s.headers.update({
            'origin': 'https://shop.',
            'referer': 'https://shop./search/search?shopId=' + str(self.shopid),
        })

        wareId_list = []
        wname_list = []
        jdPrice_list = []
        for page in range(1, self._MAX_PAGES + 1):
            # Random 0-2 s pause before each request; the original author
            # noted that requests may fail without a delay.
            time.sleep(random.random() + random.random())
            form = {
                'shopId': str(self.shopid),
                'searchPage': str(page),
                'keyword': '',
                'searchSort': '0',
                'shopCategoryId': '',
                'clickSku': '',
                'skus': '',
                'jdDeliver': '0',
                'pageFrom': '',
            }
            # NOTE(review): certificate verification is disabled here, as in
            # the original code - confirm this is still required.
            text = self.s.post(url=searchurl, data=form, verify=False,
                               timeout=self._TIMEOUT).text

            ware_ids = self._WARE_ID_RE.findall(text)
            if not ware_ids:
                # No product IDs on this page: the listing is exhausted.
                break
            ware_names = self._WNAME_RE.findall(text)
            prices = self._JD_PRICE_RE.findall(text)
            if not (len(ware_ids) == len(ware_names) == len(prices)):
                # Fail at the offending page instead of building misaligned
                # columns (or crashing only after every page was fetched).
                raise ValueError(
                    'page %d: found %d ids, %d names and %d prices'
                    % (page, len(ware_ids), len(ware_names), len(prices)))

            wareId_list.extend(ware_ids)
            wname_list.extend(ware_names)
            jdPrice_list.extend(prices)

        # One label per product row; this is an empty list when nothing was
        # found, so all columns keep the same length.
        jddata = {
            'name': [name] * len(wareId_list),
            'wareId': wareId_list,
            'wname': wname_list,
            'jdPrice': jdPrice_list,
        }
        df = pd.DataFrame(data=jddata)
        df.to_csv(os.path.join(path, 'jdmall.csv'), index=False, encoding="GB18030")
if __name__ == '__main__':
    # Example run: scrape one shop and write the CSV into ``path``.
    url = '/index-1000000182.html'  # shop address; the shop ID is read from "index-<id>.html"
    path = r'E:\JD\test'  # directory that receives the CSV file
    # Bind the instance to its own name so the class ``jd`` is not shadowed.
    spider = jd()
    spider.getpid(url, '华硕', path)  # second argument is a user-chosen label stored with every row
# NOTE: JD has since updated its site, so the code in this post no longer works.
# Updated version: /weixin_39416561/article/details/83104837