I previously wrote about scraping JD's product navigation data; this post covers scraping JD product detail page information.
# Scrape JD product detail page information
# 2017/7/30
import requests
from bs4 import BeautifulSoup
import os
import csv
import re
import json
import time

# Fetch one page of search results
def make_a_link(keyword, page):
    try:
        r = requests.get('https://search.jd.com/Search?keyword=' + keyword + '&enc=utf-8&page=' + str(2 * page - 1))
        r.raise_for_status()  # raise if the request failed
        print('Crawling page {}...'.format(page))
        print('---' * 45)
        r.encoding = 'gbk'
        return r.text
    except:
        print('Bad link!!!')
        return ''
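
# Note: the search URL counts pages in odd steps (page=1, 3, 5, ...), which is
# why the code requests str(2*page-1); each visible results page appeared to be
# delivered as two half-pages (an observation about the site circa 2017, not a
# documented API). For example:
#   logical page 1 -> &page=1
#   logical page 2 -> &page=3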

# Extract the product links from one search page
def find_only_link(html):
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find_all('div', class_='gl-i-wrap')
    # generator expression over the product links
    return (link.find('div', class_='p-name p-name-type-2').a['href'] for link in links)
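
# The same links could be pulled with one CSS selector instead of nested find
# calls; a sketch using the same classes as above:
#   (a['href'] for a in soup.select('div.gl-i-wrap div.p-name.p-name-type-2 a'))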

# Fetch a single page
def link_to_url(link):
    try:
        r = requests.get(link)
        r.raise_for_status()
        r.encoding = 'gbk'
        return r.text
    except:
        print('This page could not be reached!!!')
        return ''

# Fetch the product price
def getprice(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://p.3.cn/prices/mgets?skuIds=J_' + uid)
    jd = json.loads(content.lstrip('[').rstrip(']\n'))  # unwrap the one-element JSON array
    return jd['p']
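
# For reference, the mgets endpoint answers with a one-element JSON array; only
# the 'p' (current price) field is used here. A hypothetical sample response:
#   [{"id": "J_1234567", "p": "2899.00", "m": "3299.00"}]
# ('id' and 'm' are illustrative assumptions; 'p' is the field read above)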

# Fetch the comment statistics
def getcomment(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + uid)
    jd = json.loads(content)
    comment = []
    jds = jd['CommentsCount'][0]
    comment.append(jds['CommentCountStr'])  # number of comments
    comment.append(jds['GoodCountStr'])     # number of positive comments
    comment.append(jds['GoodRate'])         # positive-comment rate
    return comment
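
# productCommentSummaries returns plain JSON with one summary object per id in
# 'CommentsCount'. A hypothetical sample, showing only the fields read above:
#   {"CommentsCount": [{"CommentCountStr": "5000+", "GoodCountStr": "4800+", "GoodRate": 0.96}]}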

# Fetch the product name
def getname(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://c.3.cn/recommend?&methods=accessories&sku=' + uid + '&cat=9987%2C653%2C655')
    try:
        jd = json.loads(content)
        return jd['accessories']['data']['wName']
    except:
        return ''
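
# The recommend endpoint nests the product name three levels deep; the lookup
# above assumes a shape like (other fields omitted, value hypothetical):
#   {"accessories": {"data": {"wName": "Some Product Title"}}}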

# Fetch the seller (shop) name
def getseller(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://chat1.jd.com/api/checkChat?pid=' + uid + '&returnCharset=utf-8')
    try:
        jd = json.loads(content.lstrip('null(').rstrip(');'))  # strip the JSONP wrapper
        try:
            return jd['seller']
        except:
            return ''
    except:
        return ''
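
# checkChat answers as JSONP rather than bare JSON, i.e. the payload is wrapped
# as null(...);, which is why the wrapper is stripped before json.loads.
# A hypothetical sample response: null({"seller": "Some Shop"});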

# Save the results to CSV
def save_to_csv(ulist, keyword):
    path = 'D:/data/'
    if not os.path.exists(path):
        os.mkdir(path)
    # newline='' keeps csv.writer from inserting blank rows on Windows
    with open(path + 'JD_' + keyword + '_data.csv', 'w+', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Product', 'Price', 'Shop', 'Link', 'Comments', 'Positive comments', 'Positive rate'])
        for row in ulist:
            if row and row[0]:  # skip entries that have no product name
                writer.writerow(row)

# Main driver (a higher-order function: binds the keyword, returns the crawler)
def relmain(keyword):
    def main(page):
        r = re.compile(r'.*?html')
        ulist = []
        for p in range(page):
            p += 1
            text = make_a_link(keyword, p)
            for url in find_only_link(text):
                ul = []
                if r.match(url):
                    name = getname(url)  # fetch the name once and reuse it
                    if name:
                        ul.append(name)             # product name
                        print(name)
                        ul.append(getprice(url))    # price
                        ul.append(getseller(url))   # shop
                        ul.append('https:' + url)   # link
                        print('https:' + url)
                        ul.extend(getcomment(url))  # comment count, positive count, positive rate
                        print('*-*' * 45)
                ulist.append(ul)
        save_to_csv(ulist, keyword)
    return main


if __name__ == '__main__':
    keyword = input('Enter the product to scrape: ')
    pages = int(input('Enter the number of pages to scrape: '))
    time_start = time.time()
    relmain(keyword)(pages)
    print('Took {} seconds.'.format(time.time() - time_start))  # total crawl time
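
Because relmain returns a closure, relmain(keyword)(pages) first binds the search keyword and then runs the crawl. The two steps can also be split, for example (the keyword and page count below are just placeholders):

    main = relmain('laptop')  # bind the search keyword
    main(2)                   # crawl the first two result pages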