[Python] [Web Scraping] Scraping JD.com Product User Comments (Analysis + Visualization)


Note: due to changes on JD's site, this method can no longer retrieve the comment data. I have written a new version based on Scrapy and will post it when I have time. The approach below is still worth referencing, though.

---

1: On the product page, press F12 (or right-click and choose Inspect), open the Network tab, type productPageComments into the filter (that is the JSON request carrying the user comments), and refresh the page.

Clicking the button that jumps to the second page of comments makes another such JSON request appear.

Open each request's URL in a new tab and copy it into a text editor so you can compare the URLs and find the pattern.

Ding! Pattern cracked!
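For reference, a minimal sketch of that pattern: every captured URL is identical except for the `page` parameter (the `productId` and `callback` values below are the ones used in the code later in this post).

```python
# Template for the comment API URL: only the page number varies
# between the requests captured in the Network tab.
base = ('https://sclub.jd.com/comment/productPageComments.action'
        '?callback=fetchJSON_comment98vv4403&productId=3487485'
        '&score=3&sortType=5&page={}'
        '&pageSize=10&isShadowSku=0&rid=0&fold=1')

print(base.format(0))  # first page of comments
print(base.format(1))  # second page of comments
```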

2: Set a small goal first: scrape 15 pages.

Save them locally first.

```python
import requests

# Fetch the first 15 pages of comments and save each page as a
# separate JSON file on disk.
for i in range(1, 15):
    url1 = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4403&productId=3487485&score=3&sortType=5&page='
    url2 = str(i)
    url3 = '&pageSize=10&isShadowSku=0&rid=0&fold=1'
    finalurl = url1 + url2 + url3
    xba = requests.get(finalurl)
    finalu = "E:\\pachong1\\" + str(i) + ".json"
    file = open(finalu, "w")
    file.write(xba.text[26:-2])  # strip the JSONP wrapper, keep pure JSON
    file.close()
print('finished')
```
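The slice `xba.text[26:-2]` works because the response is JSONP: the JSON payload is wrapped in `fetchJSON_comment98vv4403(...);`, and the callback name plus the opening parenthesis happens to be 26 characters. A hedged sketch of a more robust way to unwrap it, so the code keeps working if the callback name changes:

```python
import re
import json

def strip_jsonp(text):
    # Drop the "callbackName(" prefix and the trailing ");" so the
    # remaining body parses as plain JSON.
    body = re.sub(r'^[^(]*\(', '', text.strip())
    return re.sub(r'\);?$', '', body)

# Demo with a dummy payload:
raw = 'fetchJSON_comment98vv4403({"comments": []});'
print(json.loads(strip_jsonp(raw)))  # -> {'comments': []}
```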




3: Next, a medium goal: scrape the comment text from 150 pages and save it to a local file.

```python
import requests
import json

# Fetch 150 pages, pull each comment's text out of the JSON, and
# append it to a local text file.
for i in range(1, 150):
    url1 = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4403&productId=3487485&score=3&sortType=5&page='
    url2 = str(i)
    url3 = '&pageSize=10&isShadowSku=0&rid=0&fold=1'
    finalurl = url1 + url2 + url3
    xba = requests.get(finalurl)
    data = json.loads(xba.text[26:-2])  # strip the JSONP wrapper
    for j in data['comments']:
        content = j['content']
        print("Comment: {}".format(content))
        with open("E:\\pachong1\\comm.txt", 'a') as file:
            file.write(content)
print("finished")
```
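A practical aside: firing 150 bare requests in a tight loop is easy for the server to throttle or block. A hedged sketch (the header values are illustrative, not anything JD is known to require) that sends a browser-like User-Agent and pauses between pages:

```python
import time
import requests

headers = {
    # Illustrative browser User-Agent; any common one should do.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://item.jd.com/3487485.html',
}

for i in range(1, 150):
    url = ('https://sclub.jd.com/comment/productPageComments.action'
           '?callback=fetchJSON_comment98vv4403&productId=3487485'
           '&score=3&sortType=5&page={}'
           '&pageSize=10&isShadowSku=0&rid=0&fold=1').format(i)
    xba = requests.get(url, headers=headers)
    time.sleep(1)  # short pause between pages to stay polite
    # ...parse xba.text as above...
```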




Below are the user comments collected from the first 150 pages for this product.

4: The ultimate goal: clean the data, count how often each word occurs, and display the result as a word cloud.

This step needs a stopwords file and simhei.ttf; both can be found online, but the stopwords file requires a download voucher, so after downloading it I put it on Baidu Cloud:

stopwords: https://pan.baidu.com/s/1K-fbpcbHJzM67Jq1O4YLZQ

```python
import re
import jieba
import pandas as pd
import numpy
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the saved comments and keep only the Chinese characters
file1 = open("E:\\pachong1\\comm.txt", 'r')
xt = file1.read()
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filedata = re.findall(pattern, xt)
xx = ''.join(filedata)
file1.close()

# Clean the data: segment into words, then drop stopwords
clear = jieba.lcut(xx)
cleared = pd.DataFrame({'clear': clear})
stopwords = pd.read_csv("chineseStopWords.txt", index_col=False, quoting=3,
                        sep="\t", names=['stopword'], encoding='GBK')
cleared = cleared[~cleared.clear.isin(stopwords.stopword)]

# Count how often each word occurs, most frequent first
count_words = cleared.groupby(by=['clear'])['clear'].agg(num=numpy.size)
count_words = count_words.reset_index().sort_values(by=["num"], ascending=False)

# Word cloud display: set the font type, font size and colors here
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                      max_font_size=250, width=1300, height=800)
word_frequence = {x[0]: x[1] for x in count_words.head(200).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
print("finish")
```
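If you would rather not pull in pandas just for the counting step, the same frequency table can be built with `collections.Counter`; a minimal self-contained sketch (the sample sentence and the tiny stopword set are made up for illustration):

```python
import jieba
from collections import Counter

text = "这个手机非常好用,屏幕清晰,电池也很耐用"
stop = {"这个", "非常", "也", "很", ","}  # toy stopword set for the demo

# Segment, drop stopwords, and count the remaining words
words = [w for w in jieba.lcut(text) if w not in stop]
print(Counter(words).most_common(10))
```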

That draws the word cloud; naturally, you can also customize it with all kinds of effects.
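For instance, WordCloud exposes parameters such as `max_words` and `colormap`; a hedged sketch (the `freqs` dict stands in for the real `word_frequence` built above):

```python
from wordcloud import WordCloud
import matplotlib.pyplot as plt

freqs = {"物流": 120, "屏幕": 95, "电池": 80, "好用": 60}  # stand-in frequencies

wc = WordCloud(font_path="simhei.ttf",    # a Chinese-capable font is required
               background_color="white",
               max_words=200,             # cap on how many words are drawn
               colormap="viridis",        # matplotlib colormap for word colors
               width=1300, height=800)
wc = wc.fit_words(freqs)
plt.imshow(wc)
plt.axis("off")
plt.show()
```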

Streamlined complete version (no intermediate storage)

PS: you need to put the stopwords file and the background image in the current directory, or specify the corresponding file paths in the program.

```python
import requests
import json
import re
import jieba
import pandas as pd
import numpy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from os import path
import numpy as np
from PIL import Image

# Scraping module
def get_comments():
    all_comments = ""
    for i in range(1, 90):
        url1 = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2&productId=7652137&score=0&sortType=5&page='
        url2 = str(i)
        url3 = '&pageSize=10&isShadowSku=0&rid=0&fold=1'
        finalurl = url1 + url2 + url3
        xba = requests.get(finalurl)
        data = json.loads(xba.text[23:-2])  # strip this endpoint's JSONP wrapper
        for j in data['comments']:
            content = j['content']
            all_comments = all_comments + content
        print(i)
    print("finished")
    return all_comments

# Data cleaning module
def data_clear():
    xt = get_comments()
    pattern = re.compile(r'[\u4e00-\u9fa5]+')   # keep Chinese characters only
    filedata = re.findall(pattern, xt)
    xx = ''.join(filedata)
    clear = jieba.lcut(xx)                      # word segmentation
    cleared = pd.DataFrame({'clear': clear})
    stopwords = pd.read_csv("chineseStopWords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='GBK')
    cleared = cleared[~cleared.clear.isin(stopwords.stopword)]
    count_words = cleared.groupby(by=['clear'])['clear'].agg(num=numpy.size)
    count_words = count_words.reset_index().sort_values(by=["num"], ascending=False)
    return count_words

# Word cloud display module
def make_wordcloud():
    d = path.dirname(__file__)
    msk = np.array(Image.open(path.join(d, "me.jpg")))  # mask image shapes the cloud
    wordcloud = WordCloud(font_path="simhei.ttf", mask=msk, background_color="#EEEEEE",
                          max_font_size=250, width=1300, height=800)
    word_frequence = {x[0]: x[1] for x in data_clear().head(200).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

if __name__ == "__main__":
    make_wordcloud()
    print("finish")
```
