Scraping the Azur Lane Wiki

I use this site all the time to look up character information, and I've long wanted to download all the info and artwork for my favorite characters. There are far too many characters to save by hand, though, so I turned to a crawler. The goals:

1. Sort the characters by faction, so they're easier to find

2. Download each character's voice lines (personal interest)

3. Download each character's standard artwork and chibi (Q-version) artwork

Here's the code:

import requests
import re
import os
import time

# Extract the faction names from one chunk of the index page
def get_camp(html):
    patten = re.findall(r'<ul><li><b><a href=".*?" title=".*?">(.*?)</a></b>.*?</li></ul>', html, re.S)
    return patten

# Create a folder for each faction and collect the folder paths
def make_camp_file(list1):
    camp_path = []
    for camp_name in list1:
        file_path = "C:\\Users\\16609\\Desktop\\blhxS\\" + camp_name[0]
        camp_path.append(file_path)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
    return camp_path
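
# Note: the base directory above is hard-coded to the original author's desktop.
# A more portable variant would be os.path.join(BASE_DIR, camp_name[0]), where
# BASE_DIR is a stand-in name for a configurable download root, not something
# from the original post.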

# Extract the character page URLs from one faction's chunk
def get_Char_url(html):
    Char_url = []
    patten = re.findall(r'<p><a href="(.*?)" title=".*?">', html)
    for url in patten:
        url = url.replace('&#58;', ':')  # the page escapes colons in hrefs as &#58;
        Char_url.append(url)
    return Char_url
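
# Note: replace('&#58;', ':') only handles the one escaped colon seen on this
# page. If other HTML entities show up in hrefs, the standard library's
# html.unescape (import html; url = html.unescape(url)) decodes them all;
# the single replace is kept above to match the original behavior.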

# Chibi (Q-version) artwork
def get_Q_Pic(response, path):
    demo = re.findall(r'<div class="qchar-container" data-ship-name="(.*?)">(.*?)</div>', response.text, re.S)
    Q_Lipainted = demo[0][1]
    Q_name = re.findall(r'alt="(.*?)"', Q_Lipainted)
    Q_url = re.findall(r'src="(.*?)"', Q_Lipainted)
    Name_Url = dict(zip(Q_name, Q_url))
    for i in Name_Url:
        url = Name_Url[i]
        Resp_pic = requests.get(url)
        Q_img = Resp_pic.content
        with open(path + i, 'wb') as f:
            f.write(Q_img)


# Character voice lines
def get_info(response, path, file_name):
    # create the character's folder up front so the image downloads can use it too
    if not os.path.exists(path):
        os.makedirs(path)
    Info = re.findall(r'<tr data-key=".*?">(.*)</tr>', response.text, re.S)
    if not Info:
        return  # some pages have no quote table
    Info_Speak = re.findall(r'<th>(.*?)</th>.*?data-lang="zh">.(.*?)</p>', Info[0], re.S)
    Info_Speak_dict = {}
    for i in Info_Speak:
        Info_Speak_dict[i[0]] = i[1]
    for i in Info_Speak_dict:
        with open(path + file_name + '.txt', 'a+', encoding='utf-8') as f:
            cont = i + ":" + Info_Speak_dict[i] + '\n'
            f.write(cont)

# Fetch each character's page and dispatch the downloads
def get_html(char_url, path):
    for url in char_url:
        response = Session.get(url)
        Name = re.findall(r'http://wiki.joyme.com/blhx/(.+)', url)
        file_path = path + '\\' + Name[0] + '\\'
        get_info(response, file_path, Name[0])
        get_pic(response, file_path)
        get_Q_Pic(response, file_path)

# Full-size character artwork
def get_pic(response, path):
    Pic = re.findall(r'<div class="tab_con.*?" style=".*?">.*?<img alt="(.*?)" src="(.*?)".*?</div>', response.text, re.S)
    Pic_dict = {}
    for i in Pic:
        i = list(i)
        if i[0] == '':
            i[0] = 'Q_GIF.gif'  # fallback filename for images with an empty alt attribute
        Pic_dict[i[0]] = i[1]
    for i in Pic_dict:
        url = Pic_dict[i]
        LB_pic_resp = requests.get(url)
        LB_pic = LB_pic_resp.content
        with open(path + i, 'wb') as f:
            f.write(LB_pic)

# The Azur Lane wiki's ship index page, grouped by faction
url = 'http://wiki.joyme.com/blhx/%E8%88%B0%E5%A8%98%E5%9B%BE%E9%89%B4'
Session = requests.session()
response = Session.get(url)

# Split the page into per-faction chunks at each divider image
camp = re.split(r'<img alt="分割线.png"', response.text)
list1 = []
for camp_name in camp:
    # extract the faction name from each chunk and collect it into list1
    s = get_camp(camp_name)
    if s != []:
        list1.append(s)

# Create a folder for each faction in the list
camp_path = make_camp_file(list1)
# Drop the first chunk: it is the page header and holds no character names or URLs
del camp[0]
camp_char = []
for i in camp:
    Char_url = get_Char_url(i)
    camp_char.append(Char_url)

for i, url in enumerate(camp_char):
    get_html(url, camp_path[i])
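
One loose end: the script imports time but never uses it, and the loops fire requests as fast as they can run. A minimal politeness sketch, reusing the get_html above but pausing between character pages (the one-second delay is my assumption, not a value from the original post):

# Same get_html as above, with a fixed pause between character pages
def get_html(char_url, path):
    for url in char_url:
        response = Session.get(url)
        Name = re.findall(r'http://wiki.joyme.com/blhx/(.+)', url)
        file_path = path + '\\' + Name[0] + '\\'
        get_info(response, file_path, Name[0])
        get_pic(response, file_path)
        get_Q_Pic(response, file_path)
        time.sleep(1)  # assumed 1-second gap; keeps the crawler from hammering the wiki

The same sleep could also go inside the download loops in get_pic and get_Q_Pic if the image requests themselves need spacing out.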