用python写一个爬取周杰伦所有歌词的爬虫

写一个爬虫爬一下周董的所有歌词，看看这么多年他为啥这么火、唱的都是什么主题的歌才能这么经久不衰，他凭啥被称为流行歌曲天王。废话不多说，直接上代码；今天比较晚了，之后再慢慢完善讲解。代码比较low，因为是边自学边完成的，所以只是实现了基本的功能，基本没怎么做优化。大家先凑合着看吧。

大概260多个文件，去重后剩下一百多个。

第一个文件爬取所有歌名列表：

import requests
from bs4 import BeautifulSoup
import traceback #获取错误信息所用的库
# Download one page of the Jay Chou song list.
def getMusicListHTML(url, kv, PXS):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        kv: Mapping passed to ``requests.get`` as ``headers``.
        PXS: Mapping passed to ``requests.get`` as ``proxies``.

    Returns:
        The decoded page text, or ``''`` when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30, proxies=PXS)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "download failed";
        # Ctrl-C and programming errors are no longer swallowed.
        print('爬取失败')
        return ''

# Extract every song title and its href from one list page.
def parseListHTML(mclisthtml, mcname, mcurl):
    """Append the titles and links found in ``mclisthtml`` to the given lists.

    Args:
        mclisthtml: HTML text of one song-list page.
        mcname: List that receives the song titles (mutated in place).
        mcurl: List that receives the matching ``href`` values (mutated in
            place, kept index-aligned with ``mcname``).

    Returns:
        The tuple ``(mcname, mcurl)``.

    Raises:
        SystemExit: After printing the traceback, if parsing fails in an
            unexpected way.
    """
    try:
        soup = BeautifulSoup(mclisthtml, 'html.parser')
        ul_tag = soup.ul
        if ul_tag is None:
            # A page without a <ul> (for example the '' returned by a failed
            # download) has nothing to extract; skip it instead of aborting
            # the whole crawl.
            print('分析错误')
            return mcname, mcurl
        # Each song sits in a <div class="name"> that wraps one <a>.
        for div_tag in ul_tag.find_all('div', class_='name'):
            a_tag = div_tag.find('a')
            if a_tag is None:
                continue
            mcname.append(a_tag.string)    # song title
            mcurl.append(a_tag.get('href'))  # link belonging to that title
        return mcname, mcurl
    except Exception:
        print('分析错误')
        # Show what went wrong before stopping.
        traceback.print_exc()
        raise SystemExit

#保存歌名到zjllist.txt 还有链接urllist.txt (去重！！！！！！)
'''def baocunname(mcname,mcurl):
   #循环保存歌名到zjllist.txt
   for name in mcname:
       #去重
       if name not in mcname:
           with open('zjllist.txt','a') as f:
               f.write(name+'\n')
               f.close()
           url = baocunurl(mcurl)
           with open('urllist.txt','a') as g:
               g.write(url+'\n')
               g.close()
       else:
           continue
def baocunurl(mcurl):
   #循环保存歌名到urllist.txt
   for url in mcurl:
       yield url'''
# Save the song titles to zjllist.txt and the links to urllist.txt.
def baocun(mcname, mcurl, mcdict):
    """Persist titles, links and the title -> link mapping.

    Appends one title per line to ``zjllist.txt`` and one link per line to
    ``urllist.txt``, fills ``mcdict`` and overwrites ``yingshe.txt`` with
    ``str(mcdict)``.

    Args:
        mcname: Song titles.
        mcurl: Links, index-aligned with ``mcname``.
        mcdict: Dict that receives ``title -> link`` (mutated in place).
    """
    # Each file is opened once instead of once per line.
    with open('zjllist.txt', 'a') as f:
        f.writelines(str(name) + '\n' for name in mcname)
    with open('urllist.txt', 'a') as g:
        g.writelines(str(url) + '\n' for url in mcurl)

    # Walk the lists back to front: a later assignment overwrites an earlier
    # one for the same title, so the entry nearest the front of the list (the
    # newer, working link according to the original author) is the one kept.
    for i in reversed(range(len(mcname))):
        mcdict[mcname[i]] = mcurl[i]
    with open('yingshe.txt', 'w') as h:
        h.write(str(mcdict))

'''#循环获取zjllist_end.txt文件中的歌名并从字典中取出对应歌名的url添加到musicurl.txt文件中，以便于生成完整的歌曲url 用于后续的歌名对应的歌曲的歌词爬取
def getMusic(mcdict,musicurl):
   with open('zjllist_end.txt','r') as f: #在使用这个函数之前要先执行shaixuan.py文件筛选歌名(中间涉及到人工手动筛选) 最后把结果保存到zjllist_end.txt中再调用这个函数
       namelist=f.read().split('\n')
   for name in namelist:
       if name in mcdict:
           print(mcdict[name])
           musicurl.append(mcdict[name])
       with open('musicurl.txt','a') as g:
           g.write(mcdict[name]+'\n')'''

#分析歌词
#1.用jieba分词
#2.用counter统计词频
#3.用wordcloud生成云词
#4.其他另加

# Entry point of the list crawler.
def main():
    """Crawl the 46 list pages, then save titles, links and their mapping."""
    # A dict literal keeps only the LAST value of a repeated key, so the three
    # "https" entries of the original "proxy pool" always collapsed to this
    # single proxy. The other two addresses were
    # http://60.184.207.101:8998 and http://115.220.2.116:808.
    # NOTE(review): requests picks a proxy by URL scheme and the list URL
    # below is http://, so this "https" entry is presumably never used -
    # confirm before relying on it.
    PXS = {"https": "http://60.246.85.35:8080"}
    mcdict = {}  # title -> link mapping
    mcname = []  # song titles
    mcurl = []   # song links
    kv = {'user-agent': 'Mozilla/5.0'}
    # Request the list pages one by one (pn = 0 .. 45).
    for i in range(46):
        url = 'http://www.kuwo.cn/artist/contentMusicsAjax?artistId=336&pn={}&rn=30'.format(i)
        musicHTML = getMusicListHTML(url, kv, PXS)
        parseListHTML(musicHTML, mcname, mcurl)
        print(len(mcname))  # running total of collected titles
    # Save once, after the loop, so the files are not written once per page.
    baocun(mcname, mcurl, mcdict)
    print('请筛选歌曲名')


if __name__ == '__main__':
    main()
'''a=getMusicListHTML(url,kv)
with open('zjl2.txt','w') as f:
f.write(a)
f.close()'''

第二个文件筛选：

# Drop accompaniment tracks.
def shaixuan():
    """Copy the titles in zjllist.txt that lack '伴奏' to zjllist2.txt.

    Every kept title is printed and appended to zjllist2.txt. Empty lines
    (such as the one ``split('\\n')`` yields after the final newline) are
    skipped so blank lines do not pile up from one filtering pass to the next.
    """
    with open('zjllist.txt', 'r') as f:
        namelist = f.read().split('\n')
    # The output file is opened once instead of once per title.
    with open('zjllist2.txt', 'a') as g:
        for name in namelist:
            if name and '伴奏' not in name:
                print(name)
                g.write(name + '\n')
# Drop ringtone tracks.
def shaixuan2():
    """Copy the titles in zjllist2.txt that lack '铃声' to zjllist3.txt.

    Every kept title is printed and appended to zjllist3.txt. Empty lines
    (such as the one ``split('\\n')`` yields after the final newline) are
    skipped so blank lines do not pile up from one filtering pass to the next.
    """
    with open('zjllist2.txt', 'r') as f:
        namelist = f.read().split('\n')
    # The output file is opened once instead of once per title.
    with open('zjllist3.txt', 'a') as g:
        for name in namelist:
            if name and '铃声' not in name:
                print(name)
                g.write(name + '\n')
# Drop live versions.
def shaixuan3():
    """Copy the titles in zjllist3.txt that lack 'Live' to zjllist4.txt.

    The match is case-sensitive. Every kept title is printed and appended to
    zjllist4.txt. Empty lines (such as the one ``split('\\n')`` yields after
    the final newline) are skipped so blank lines do not pile up from one
    filtering pass to the next.
    """
    with open('zjllist3.txt', 'r') as f:
        namelist = f.read().split('\n')
    # The output file is opened once instead of once per title.
    with open('zjllist4.txt', 'a') as g:
        for name in namelist:
            if name and 'Live' not in name:
                print(name)
                g.write(name + '\n')
# Collect duet titles (those containing "With"/"with") for manual review.
def shaixuan4():
    """Append every title in zjllist4.txt containing 'With' or 'with' to hechangmc.txt.

    Author's note from the original comment: the later pass that drops titles
    with brackets also drops these duet titles, so hechangmc.txt is meant to be
    checked by hand against the final list (zjllist_end.txt). A duet may be a
    re-recorded old song or a new one; a new one will be missing from the
    final list and should then be added to it manually.
    """
    with open('zjllist4.txt', 'r') as source:
        titles = source.read().split('\n')
    for title in titles:
        is_duet = 'With' in title or 'with' in title
        if not is_duet:
            continue
        print(title)
        with open('hechangmc.txt', 'a') as duet_file:
            duet_file.write(title + '\n')
# Drop titles that contain brackets or similar decoration characters.
def shaixuan5():
    """Copy the undecorated titles in zjllist4.txt to zjllist5.txt.

    A title is dropped when it contains any of the characters
    ``( （ { [ + - 「 《 【``. Every kept title is printed and appended to
    zjllist5.txt. Empty lines (such as the one ``split('\\n')`` yields after
    the final newline) are skipped so blank lines do not pile up from one
    filtering pass to the next.
    """
    markers = ('(', '（', '{', '[', '+', '-', '「', '《', '【')
    with open('zjllist4.txt', 'r') as f:
        namelist = f.read().split('\n')
    # The output file is opened once instead of once per title.
    with open('zjllist5.txt', 'a') as g:
        for name in namelist:
            if name and not any(marker in name for marker in markers):
                print(name)
                g.write(name + '\n')

# Remove duplicate titles.
def shaixuan6():
    """Copy the first occurrence of each title in zjllist5.txt to zjllist6.txt.

    Original order is preserved. Every kept title is printed and appended to
    zjllist6.txt. Empty lines (such as the one ``split('\\n')`` yields after
    the final newline) are skipped so they are not written out as a title.
    """
    seen = set()  # set membership is O(1), unlike the original list lookup
    with open('zjllist5.txt', 'r') as f:
        namelist = f.read().split('\n')
    # The output file is opened once instead of once per title.
    with open('zjllist6.txt', 'a') as g:
        for name in namelist:
            if not name or name in seen:
                continue
            seen.add(name)
            print(name)
            g.write(name + '\n')
# Split off titles containing a space or '&' for manual review.
def shaixuan7():
    """Route each title in zjllist6.txt to zjllist7.txt or rengong.txt.

    Titles containing a space or ``&`` are appended to rengong.txt; per the
    original author, normal songs among them are later added by hand to the
    final list zjllist_end.txt. All other titles are printed and appended to
    zjllist7.txt. Empty lines (such as the one ``split('\\n')`` yields after
    the final newline) are skipped so they are not written out as a title.
    """
    with open('zjllist6.txt', 'r') as f:
        namelist = f.read().split('\n')
    # Both output files are opened once instead of once per title.
    with open('zjllist7.txt', 'a') as g, open('rengong.txt', 'a') as h:
        for name in namelist:
            if not name:
                continue
            if ' ' not in name and '&' not in name:
                print(name)
                g.write(name + '\n')
            else:
                h.write(name + '\n')

# Run the seven filtering passes in their numbered order.
for step in (shaixuan, shaixuan2, shaixuan3, shaixuan4, shaixuan5, shaixuan6, shaixuan7):
    step()
# Reminder for the manual step; if the saved file ends with a blank line,
# remember to delete that line.
print('请手动精确筛选，包括筛选zjllist7.txt(比如：周杰伦2016全新数字专辑酷我专属预告)和人工筛选hechangmc.txt、rengong.txt，筛选完成后创建zjllist_end.txt并把筛选后的内容保存进去')
print('完成上一步后执行setmcurl.py文件')

第三个文件创建链接：

# Look up the link of every curated title and append it to musicurl.txt.
def getMusicurl():
    """Write the link of each title in zjllist_end.txt to musicurl.txt.

    zjllist_end.txt is the hand-curated title list produced after running the
    filtering script; yingshe.txt holds the ``title -> link`` dict written as
    a Python literal. Each link found is printed and appended to
    musicurl.txt, where it is later used to build the full song URL. Titles
    missing from the mapping (including empty lines) are skipped.
    """
    import ast  # local import: this script has no import section of its own

    with open('zjllist_end.txt', 'r') as f:
        namelist = f.read().split('\n')
    with open('yingshe.txt', 'r') as g:
        # literal_eval parses the saved dict literal without executing
        # arbitrary code, unlike eval().
        mcdict = ast.literal_eval(g.read())
    with open('musicurl.txt', 'a') as out:
        for name in namelist:
            url = mcdict.get(name)
            if url is None:
                # Unknown title, blank line, or a title saved without a link.
                continue
            print(url)
            out.write(url + '\n')

# Build the full URL of every song.
def seturl():
    """Prefix each path in musicurl.txt with the site root.

    Appends one full URL per line to complete_mc.txt. Blank lines are
    skipped, so the result does not depend on whether musicurl.txt ends with
    a newline.
    """
    url_first = 'http://bd.kuwo.cn'  # site root
    with open('musicurl.txt', 'r') as f:
        # Filtering blanks replaces the original urllist[:-1], which dropped
        # the last real entry when the file had no trailing newline.
        paths = [line for line in f.read().split('\n') if line]
    with open('complete_mc.txt', 'a') as g:
        g.writelines(url_first + path + '\n' for path in paths)
# First write the links of the curated titles to musicurl.txt, then expand
# them into full URLs in complete_mc.txt.
getMusicurl()
seturl()
# Tell the user which script to run next.
print('执行music.py文件爬取歌词')

第四个文件爬取所有歌词：

#执行此程序之前别忘了在文件夹里创建一个名为music的文件夹！！！！！！！！！！
#爬取所有歌的歌词并保存到本地
import requests
from bs4 import BeautifulSoup
import bs4
import traceback
#import setmcurl
def openf():
    """Return the contents of complete_mc.txt split on newline characters.

    The result keeps the empty string that follows a trailing newline.
    """
    with open('complete_mc.txt', 'r') as url_file:
        raw_text = url_file.read()
    return raw_text.split('\n')

'''#从urllist中获取url
def geturl(urllist):
for url in urllist:
yield url '''

#打开complete.txt文件

def getNanFangHTML(url, kv, PXS):
    """Download ``url`` through the given proxies and return the page text.

    Args:
        url: Address of the song page.
        kv: Mapping passed to ``requests.get`` as ``headers``.
        PXS: Mapping passed to ``requests.get`` as ``proxies``.

    Returns:
        The decoded page text, or ``""`` when the request fails or the server
        answers with an error status (the traceback is printed in that case).
    """
    try:
        r = requests.get(url, headers=kv, proxies=PXS, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text  # the text itself, not the literal string 'r.text'
    except requests.RequestException:
        # Only network/HTTP failures count as "download failed"; Ctrl-C and
        # programming errors are no longer swallowed.
        print('爬取失败')
        traceback.print_exc()
        return ""
def huoqugeci(html):
    """Extract the song title and lyric lines from a song page.

    Args:
        html: HTML text of the song page.

    Returns:
        A tuple ``(name, lines)``. ``name`` is the string of the element with
        ``id='lrcName'``; ``lines`` is a list holding the ``.string`` of every
        ``<p>`` inside the element with ``id='llrcId'``, or ``''`` when that
        element is missing.

    Raises:
        ValueError: If the page has no ``id='lrcName'`` element. The original
            code failed here too, but only by accident: its ``except`` branch
            returned a ``name`` that had never been assigned.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Song title. find_all returns a list, hence the [0].
    name_tags = soup.find_all(id='lrcName')
    if not name_tags:
        raise ValueError('song title element (id="lrcName") not found')
    name = name_tags[0].string
    # Lyrics container.
    lyric_tags = soup.find_all(id="llrcId")
    if not lyric_tags:
        return name, ''
    # Calling a tag is shorthand for find_all: one entry per <p> line.
    gc = [p_tag.string for p_tag in lyric_tags[0]('p')]
    return name, gc
def printgeci(glist, fpath):
    """Print the lyric lines and save them to ``fpath``.

    Args:
        glist: Iterable of lyric lines. ``None`` entries (lines for which no
            single string could be extracted) are skipped instead of raising
            ``TypeError`` and losing the whole song. An empty string is
            accepted and produces an empty list.
        fpath: Destination file, overwritten as UTF-8 with ``str()`` of the
            list of kept lines.
    """
    lines = [line for line in glist if line is not None]
    for line in lines:
        print(line + '\n')
    # The context manager closes the file even if the write fails.
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(str(lines))

def main(music_dir='H:/Sublime/bsxmxg/bsxm67/bsxm8/music'):
    """Download the lyrics of every URL in complete_mc.txt.

    Args:
        music_dir: Existing directory that receives one ``<title>.txt`` file
            per song. Defaults to the path hard-coded by the original author.
    """
    # Present the crawler as a browser (used as the request headers).
    kv = {'user-agent': 'Mozilla/5.0'}
    # A dict literal keeps only the LAST value of a repeated key, so the three
    # "https" entries of the original "proxy pool" always collapsed to this
    # single proxy. The other two addresses were
    # http://60.184.207.101:8998 and http://115.220.2.116:808.
    PXS = {"https": "http://60.246.85.35:8080"}
    for url in openf():
        if not url:
            # Blank entry, e.g. the one after the file's trailing newline.
            continue
        try:
            print(url)
            html = getNanFangHTML(url, kv, PXS)
            name, lyrics = huoqugeci(html)
            fpath = '{}/{}.txt'.format(music_dir, name)
            printgeci(lyrics, fpath)
        except Exception:
            # Some pages cannot be opened or parsed: report the reason and
            # move on to the next song. Ctrl-C is no longer swallowed.
            traceback.print_exc()
            continue
    print('下一步执行fenci.py')


if __name__ == '__main__':
    main()

第五个文件歌词分析展示：

#分词然后统计最后生成词云
import os
import jieba
from collections import Counter
from wordcloud import WordCloud
from scipy.misc import imread

# Tokenize every lyric file with jieba.
def fenci(all_words):
    """Extend ``all_words`` with the distinct tokens of each file in ``music``.

    Each file is read as UTF-8 and cut with jieba. Tokens are de-duplicated
    per song with ``set`` before being added, so a word is added at most once
    per song. The token count and the full token list of every song are
    printed.
    """
    for filename in os.listdir('music'):
        with open('music/' + filename, encoding='utf-8') as lyric_file:
            lyric_text = lyric_file.read()
        tokens = list(jieba.cut(lyric_text))
        distinct_tokens = set(tokens)
        all_words.extend(distinct_tokens)
        print(len(tokens))
        print(tokens)

# Count word frequencies.
def cipin(all_words):
    """Rank the words by frequency, drop ignored words and save the result.

    Args:
        all_words: Iterable of tokens.

    Returns:
        A list of ``(word, count)`` pairs, most frequent first, without the
        ignored words. It is also printed and written to cipin.txt as
        ``str()`` of the list.
    """
    ranked = Counter(all_words).most_common()
    print(ranked)
    print('\n')
    print('\n')

    # Hand-picked punctuation, particles and filler words. A word is ignored
    # when it occurs anywhere inside this string (substring test).
    ignored = "，、。“”'（）！；[] ,‘’一片一张一句我用一天想要为什么无法不能这样不用打开叫做有人几个QQ我会因为谁会的我在你了是着：:都说就那周杰伦也再还被要方文山作词作曲为但吧啊与啦不有上只人给去里到才又已找该脸请我们没有什么知道一个开始怎么已经可以不会不是自己不要不到还是一直只是就是真的最后就算一样还有这么记得需要为了有点一种能够只有不想"
    most_words2 = []
    for pair in ranked:
        token = pair[0]
        if token in ignored:
            continue
        # Keep multi-character words, plus the single character '爱'.
        if len(token) > 1 or token == '爱':
            most_words2.append(pair)

    print(most_words2)
    print(len(most_words2))
    with open('cipin.txt', 'w', encoding='utf-8') as out:
        out.write(str(most_words2))
    return most_words2
# Render the word cloud.
# Author's note: to run this step on its own, save most_words2 to a text file
# first (write each entry followed by '\n' so it can be read back with
# split('\n') into a list) and load it again in the separate program.
def ciyun(cipin_list):
    """Draw a word cloud from ``(word, count)`` pairs and save it as zjl11.jpg.

    Args:
        cipin_list: Iterable of pairs; index 0 is the word, index 1 its count.
    """
    # Word -> frequency mapping handed to WordCloud.
    frequencies = {entry[0]: entry[1] for entry in cipin_list}
    # NOTE(review): imread is imported from scipy.misc at the top of this
    # script; that function was removed in SciPy 1.2 - confirm the installed
    # version still provides it.
    mask_image = imread('zjlyt.png')
    cloud = WordCloud(
        # The text is Chinese, so a Chinese font file has to be supplied.
        font_path='zhaozi.ttf',
        # Maximum number of words shown.
        max_words=200,
        # Background colour.
        background_color='white',
        # Shape of the cloud. Per the original author, width/height are
        # ignored once a mask is set, so they are not passed.
        mask=mask_image,
    )
    cloud.generate_from_frequencies(frequencies)

    cloud.to_file('zjl11.jpg')