南京音乐推荐联合社

爬取网易云音乐某首歌曲的全部评论,以及生成相对应的词云图

2019-10-18 15:00:42

写在前面:爬取网页时,难免会遇到翻页的问题,本次爬取过程中也遇到了。在解决翻页的过程中,发现网上已经有人实现了(地址:http://www.cnblogs.com/lyrichu/p/6635798.html),总结后发现这种方法是通用的,不过如果使用python3的话,还是会有一个小小的坑。 环境:python3.5、win10、Pycharm

话不多说,直接上代码

get_comments.py
  1. # encoding: utf-8

  2. """

  3. @author: Sunmouren

  4. @contact: sunxuechao1024@gmail.com

  5. @time: 2018/5/12 22:42

  6. @desc: 爬取网易云音乐全部评论

  7. """

  8. import base64

  9. import json

  10. import codecs

  11. import requests

  12. from Crypto.Cipher import AES

# HTTP headers sent with every POST issued by get_json().
# NOTE(review): the Cookie value looks like a captured browser session that
# embeds account identifiers; consider loading it from configuration rather
# than keeping it in source, and confirm whether the endpoint needs it at all.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
    'Cookie': "_ntes_nuid=bae6bddbcaae44ff2fb705f942b04429; usertrack=ezq0plqLoYlIU74bMVu/Ag==; _ntes_nnid=06b1f38610caa21ede9d767ad2370183,1519100299969; _ga=GA1.2.1528031140.1519100301; __f_=1521282707045; P_INFO=m15595757119_1@163.com|1524116518|0|mail163|00&99|CN&1524116454&mailsettings#zhj&330300#10#0#0|155119&1|mailsettings|15595757119@163.com; nts_mail_user=15595757119@163.com:-1:1; _iuqxldmzr_=32; WM_TID=bi2ul0cBG7fnlIOPLQRMlothc2PY0i5k; __e_=1525966530859; JSESSIONID-WYYY=yCXaNbRPwzwYr0mj7kc%2F31mUGrgjXDUhwcJUpWtfXRC3J33m0lJtADdIqI2H5g3fkFTtptyQtzJgxN1axfJxUZ0uv7fPUrT1kvaGuG0c2uI2Ge05oAfi6PpdT5281uNH4KmF3uS7Mqm3up8S5cEN47vDwnOQCMPt4WmGAl%2BmNeKiv%5C9p%3A1526119472806; __utma=94650624.1528031140.1519100301.1525754678.1526117673.2; __utmc=94650624; __utmz=94650624.1526117673.2.2.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utmb=94650624.5.10.1526117673",
    'Referer': "http://music.163.com/"
}

# Plaintext request body for the first page. get_params() assigns its own
# local variable with this name, so the module-level value is not read by the
# code visible here.
first_param = "{rid:\"\", offset:\"0\", total:\"true\", limit:\"20\", csrf_token:\"\"}"
# second_param / third_param are not referenced by the code visible here;
# presumably the RSA public exponent and modulus behind encSecKey — TODO confirm.
second_param = "010001"
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Key used by get_params() for the first AES pass.
forth_param = "0CoJUm6Qyw8W8jud"

  22. def get_params(page):

  23.    iv = "0102030405060708"

  24.    first_key = forth_param

  25.    second_key = 16 * 'F'

  26.    if(page == 1):

  27.        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'

  28.        h_encText = AES_encrypt(first_param, first_key, iv)

  29.    else:

  30.        offset = str((page - 1)*20)

  31.        first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')

  32.        h_encText = AES_encrypt(first_param, first_key, iv)

  33.    h_encText = AES_encrypt(h_encText, second_key, iv)

  34.    return h_encText

  35. # 加密过程

  36. def AES_encrypt(text, key, iv):

  37.    pad = 16 - len(text) % 16

  38.    """

  39.    原有的这里就只有这样: text = text + pad * chr(pad)

  40.    不过如果用的是python3的话会出现错误: TypeError: can't concat bytes to str

  41.    这是因为python3不可以让bytes 和 str 直接连接, 解决可以加个类型判断,然后进行转换:

  42.    """

  43.    if type(text) is bytes:

  44.        text = text + (pad * chr(pad)).encode('utf-8')

  45.    else:

  46.        text = text + pad * chr(pad)

  47.    encryptor = AES.new(key, AES.MODE_CBC, iv)

  48.    encrypt_text = encryptor.encrypt(text)

  49.    encrypt_text = base64.b64encode(encrypt_text)

  50.    return encrypt_text

  51. def get_encSecKey():

  52.    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"

  53.    return encSecKey

  54. def get_json(url, params, encSecKey):

  55.    data = {

  56.        'params': params,

  57.        'encSecKey': encSecKey

  58.    }

  59.    response = requests.post(url, headers=headers, data=data)

  60.    return response.content

  61. def get_pages(url):

  62.    params = get_params(1)

  63.    encSeckey = get_encSecKey()

  64.    json_text = get_json(url, params, encSeckey).decode()

  65.    json_dict = json.loads(json_text)

  66.    print(json_text)

  67.    comments_num = int(json_dict['total'])

  68.    if (comments_num % 20 == 0):

  69.        pages = comments_num / 20

  70.    else:

  71.        pages = int(comments_num / 20) + 1

  72.    print("一共有%d页评论, %条评论" % (pages, comments_num))

  73.    return pages

  74. def get_comments(url, pages):

  75.    comments = list()

  76.    for page in range(pages):

  77.        params = get_params(page + 1)

  78.        encSeckey = get_encSecKey()

  79.        json_text = get_json(url, params, encSeckey).decode()

  80.        json_dict = json.loads(json_text)

  81.        for item in json_dict['comments']:

  82.            content = item['content']

  83.            comments.append(content)

  84.        print("第%d页抓取完毕" % (page + 1))

  85.    return comments

  86. def save_to_file(comments, filename):

  87.    with codecs.open(filename, 'a', encoding='utf-8') as f:

  88.        for comment in comments:

  89.            f.write(comment)

  90.    print("写入成功!")

  91. if __name__ == '__main__':

  92.    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_31877636?csrf_token='

  93.    pages = get_pages(url)

  94.    comments = get_comments(url=url, pages=pages)

  95.    save_to_file(comments=comments, filename="test.txt")

get_wordclound.py

  1. # encoding: utf-8

  2. """

  3. @author: Sunmouren

  4. @contact: sunxuechao1024@gmail.com

  5. @time: 2018/5/13 13:54

  6. @desc: 通过爬取的网易云音乐全部评论生成词云图

  7. """

  8. import jieba

  9. from wordcloud import WordCloud

  10. if __name__ == '__main__':

  11.    # 读取文本

  12.    text = open("test.txt", "r", encoding='utf-8').read()

  13.    # 分词

  14.    cut_text = jieba.cut(text)

  15.    result = "/".join(cut_text)

  16.    # 生成词云图, xingkai.ttf是中文字体,用到它是因为WordCloud本身好像没有支持中文的。

  17.    wordcloud = WordCloud(font_path=r"E:\PycharmWorkPlace\wordcloud_demo\xingkai.ttf", background_color='white',

  18.                          width=800,

  19.                          height=600, max_font_size=50,

  20.                          max_words=1000, mode='RGBA',colormap='pink')

  21.    wordcloud.generate(result)

  22.    wordcloud.to_file("test.png")


词云图(你能猜出是哪一首歌吗?)

写在后面:要注意需要用到的模块,如果没有安装的可以自行安装,最后github地址可以通过点击下面原文链接进行访问。

Copyright © 南京音乐推荐联合社@2017