爬取漫画DB上的JoJo的奇妙冒险 第七部 飙马野郎

SBR是JOJO系列我最喜欢的一部,所以今天把漫画爬取到本地,日后慢慢看。

import re
import time
import requests
from requests import codes
from bs4 import BeautifulSoup
from requests import RequestException

def get_page(url):
    """Fetch the HTML of *url* and return it as text.

    Returns None both on a non-200 status code and on any requests
    exception, so callers must check the result before parsing it.
    """
    try:
        # Spoof a desktop Chrome browser: some sites reject the default
        # python-requests User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   + '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def get_pagesNumber(text):
    """Extract the chapter's total page count from a chapter page's HTML.

    The site stores the count in the 'data-total' attribute of a hidden
    <div class="d-none vg-r-data"> element.  Returned as a string; the
    caller converts it with int().
    """
    soup = BeautifulSoup(text, 'lxml')
    pagesNumber = soup.find(name='div', class_="d-none vg-r-data")
    return pagesNumber.attrs['data-total']
    
def parse_page(text):
    """Parse one comic-page's HTML and yield a dict with the image URL,
    chapter title and page indicator.

    Written as a generator (yield instead of return): return would end
    the function after one result, whereas a generator is suspended
    after producing each value and resumed when the next one is needed.
    """
    soup = BeautifulSoup(text, 'lxml')
    url = soup.find(name='img', class_="img-fluid show-pic")
    chapter = soup.find(name='h2', class_="h4 text-center")
    page = soup.find(name='span', class_="c_nav_page")
    yield {
        'url': url['src'],
        'chapter': chapter.get_text(),
        'page': page.get_text()
    }
    
    
def save_image(item):
    """Download item['url'] to SBR/<chapter>/<page>.jpg.

    Creates the chapter directory on demand and skips files that already
    exist, so an interrupted run can be resumed.  Any download error is
    printed and swallowed (best-effort).
    """
    img_path = 'SBR' + os.path.sep + item.get('chapter')  # os.path.sep is the platform path separator
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    try:
        resp = requests.get(item.get('url'))
        if codes.ok == resp.status_code:
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=item.get('page'), file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    # 24 chapters in total, slugs 292..315 (the colour edition uses 13283..13306).
    for chapter in range(292, 316):
        url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '.html'
        text = get_page(url)
        pagesNumber = get_pagesNumber(text)  # total pages of the current chapter
        for page in range(1, int(pagesNumber) + 1):
            url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '_' + str(page) + '.html'
            # colour edition: url = 'https://www.manhuadb.com/manhua/147/1330_' + str(chapter) + '_' + str(page) + '.html'
            text = get_page(url)
            for item in parse_page(text):
                save_image(item)

 最后得到如下的下载结果。

Published by

风君子

独自遨游何稽首 揭天掀地慰生平

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注