SBR是JOJO系列我最喜欢的一部,所以今天把漫画爬取到本地,日后慢慢看。
"""Download the black-and-white "Steel Ball Run" manga pages from manhuadb.com.

Each chapter page is fetched, the image URL / chapter title / page label are
scraped from the HTML, and the image is stored under ``SBR/<chapter>/``.
"""
import os
import re
import time

import requests
from requests import codes
from bs4 import BeautifulSoup
from requests import RequestException

# Common prefix of every reader URL used below; the chapter id (and optionally
# the page number) plus '.html' is appended to it.
BASE_URL = 'https://www.manhuadb.com/manhua/147/4_'

# Browser-like User-Agent sent with every HTML page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30


def get_page(url):
    """Fetch ``url`` and return the response body as text.

    Returns ``None`` when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_pagesNumber(text):
    """Return the ``data-total`` attribute of the page-data ``div`` in ``text``.

    The value is returned as found in the HTML (a string). Returns ``None``
    when ``text`` is empty or the expected element/attribute is missing.
    """
    if not text:
        return None
    soup = BeautifulSoup(text, 'lxml')
    pages_div = soup.find(name='div', class_="d-none vg-r-data")
    if pages_div is None:
        return None
    return pages_div.attrs.get('data-total')


def parse_page(text):
    """Yield one dict with the image ``url``, ``chapter`` title and ``page`` label.

    Nothing is yielded when ``text`` is empty or any of the three elements
    cannot be found, so callers can simply iterate over the result.
    """
    if not text:
        return
    soup = BeautifulSoup(text, 'lxml')
    image = soup.find(name='img', class_="img-fluid show-pic")
    chapter = soup.find(name='h2', class_="h4 text-center")
    page = soup.find(name='span', class_="c_nav_page")
    if image is None or chapter is None or page is None:
        return
    src = image.get('src')
    if not src:
        return
    # ``return`` ends the function once it hands back a result, whereas
    # ``yield`` turns the function into a generator: it produces a value,
    # is suspended, and produces the next value when resumed.
    yield {
        'url': src,
        'chapter': chapter.get_text(),
        'page': page.get_text(),
    }


def save_image(item):
    """Download ``item['url']`` to ``SBR/<chapter>/<page>.jpg``.

    An existing file is left untouched. Network and file-system errors are
    printed rather than raised so that one bad page does not stop the crawl.
    """
    img_path = os.path.join('SBR', item.get('chapter'))
    file_path = os.path.join(
        img_path,
        '{file_name}.{file_suffix}'.format(
            file_name=item.get('page'), file_suffix='jpg'))
    try:
        # exist_ok avoids the separate exists() check before creating the folder.
        os.makedirs(img_path, exist_ok=True)
        resp = requests.get(item.get('url'), timeout=REQUEST_TIMEOUT)
        if codes.ok == resp.status_code:
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)
    except (RequestException, OSError) as e:
        print(e)


def main():
    """Walk every chapter and page of the series and save each image."""
    # The series has 24 chapters, with ids 292 through 315.
    # NOTE(review): the original author's notes suggest the colour edition
    # uses the URL prefix '.../manhua/147/1330_' with ids 13283-13306 -
    # confirm against the site before relying on it.
    for chapter in range(292, 316):
        url = BASE_URL + str(chapter) + '.html'
        text = get_page(url)
        pagesNumber = get_pagesNumber(text)  # total page count of this chapter
        if pagesNumber is None:
            print('Could not read page count for chapter %s' % chapter)
            continue
        for page in range(1, int(pagesNumber) + 1):
            url = BASE_URL + str(chapter) + '_' + str(page) + '.html'
            text = get_page(url)
            for item in parse_page(text):
                save_image(item)


if __name__ == '__main__':
    main()
最后得到,