Searching Google Scholar for Literature and Batch-Downloading Papers

Two Python files

The main script, Search&Download.py (a sample run follows the listing):

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from Download import Hubber
import xlwt,os
from time import sleep
from tqdm import tqdm


TotalNum=0
class Article(object):
    title = ""
    article_link = ""
    journal = ""
    authors = ""
    authors_link = ""
    abstract = ""
    def __init__(self):
        self.title = "New Paper"

def save_xls(sheet, paper):
    # Write one paper's fields into a row of the Excel sheet, column by column
    global TotalNum
    sheet.write(TotalNum, 0, TotalNum)
    sheet.write(TotalNum, 1, paper.title)
    sheet.write(TotalNum, 2, paper.article_link)
    sheet.write(TotalNum, 3, paper.journal)
    sheet.write(TotalNum, 4, paper.authors_link)
    sheet.write(TotalNum, 5, paper.abstract)
    TotalNum += 1

head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }  # Updated 2021-06-07 to avoid HTTP 403 errors
article_titles = []
article_links = []

def GetInfo(sheet, url):
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    #print("\n"+soup)
    articles = soup.find_all(class_="gs_ri")
    for article in articles:
        paper = Article()
        try:
            title = article.find('h3')
            paper.title = title.text
            #print("\n"+paper.title)
            article_titles.append(paper.title)
            paper.article_link = title.a.get('href')
            #print("\n"+paper.article_link)
            article_links.append(paper.article_link)

            journal = article.find(class_="gs_a")
            paper.journal = journal.text
            #print("\n"+paper.journal)
            authors_addrs = journal.find_all('a')
            for authors_addr in authors_addrs:
                #print("\n"+authors_addr.get('href'))
                paper.authors_link = paper.authors_link + authors_addr.get('href') + "\n"

            abstract = article.find(class_="gs_rs")
            paper.abstract = abstract.text
            #print("\n"+paper.abstract)
        except:
            continue
        save_xls(sheet, paper)
    return


def getArticle(article_titles, article_links):
    dir = ".\\Articles\\" + keyword + "\\"
    #print(dir)
    if os.path.exists(dir) == False:
        os.makedirs(dir)
    for k in tqdm(range(len(article_titles))):
        article_titles[k] = "{0}".format(article_titles[k].replace(':', ' ')).replace('.', '')
        path = dir + article_titles[k] + ".pdf"
        #print("\n"+path)
        try:
            Hubber.getPDF(article_links[k], path)
            sleep(0.5)
        except:
            continue

if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'PaperInfo', True)
    column = ['序号', '文章题目', '文章链接', '期刊', '作者链接', '摘要']
    for i in range(0, len(column)):
        sheet1.write(TotalNum, i, column[i])
    TotalNum += 1

    keyword = input("keywords is?\n")
    #keyword = diabetes and conjunctiva and (microcirculation or microvasculature)
    #print("\n"+keyword)
    key = keyword.replace(" ", "+")
    info = keyword + "_PaperInfo.xls"

    print("\n"+"检索中……")
    if os.path.exists(info) == True:
        print("\n" + "PaperInfo already exists!")
    else:
        start = 0
        for i in tqdm(range(10)):
            url = 'https://xs.dailyheadlines.cc/scholar?start=' + str(start) + '&q=' + key + '&hl=zh-CN&as_sdt=0,5'
            start = start + 10
            GetInfo(sheet1, url)
            myxls.save(keyword + '_PaperInfo.xls')
            sleep(0.5)
    print("\n"+"检索完成")

    print("\n"+"下载中……")
    if len(article_titles) != 0:
        getArticle(article_titles, article_links)
    else:
        import xlrd
        data = xlrd.open_workbook(info)
        table = data.sheet_by_index(0)
        article_titles = table.col_values(1)[1:]
        article_links = table.col_values(2)[1:]
        #print("\n"+article_titles)
        #print("\n"+article_links)
        getArticle(article_titles, article_links)
    print("\n" + "下载完成")

The helper module, Download.py. More sites can be added to it (see the sketch after the listing)!

import os.path
import re
import requests
from bs4 import BeautifulSoup

class Hubber:
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }  # Updated 2021-06-07 to avoid HTTP 403 errors

    @staticmethod
    def pdf_hub(url, path):
        try:
            pdf = requests.get(url, headers=Hubber.head)
            with open(path, "wb") as f:
                f.write(pdf.content)
            print("\n"+"pdf found directly!")
        except:
            print("\n"+"failed to download pdf directly!\n" + url)
            Hubber.err_log(url)
    @staticmethod
    def sci_hub(path, doi):
        doi = str(doi).split("https://doi.org/")[1]
        url = "https://www.sci-hub.ren/doi:" + doi + "#"
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        download_url = soup.iframe.attrs["src"]
        try:
            download_r = requests.get(download_url, headers=Hubber.head)
            download_r.raise_for_status()
            with open(path, "wb+") as temp:
                temp.write(download_r.content)
                print("\n"+"Article downloaded by doi!")
        except:
            print("\n"+"failed to download pdf by doi!\n" + url)
            Hubber.err_log(url)

    @staticmethod
    def err_log(url):
        with open("download_err.txt", "a+", encoding="utf-8") as error:
            error.write("PDF not found, download link may be: \n" + url + "\n")

    @staticmethod
    def getSoup(url):
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        return soup

    @staticmethod
    def getPDF(url, path):
        if os.path.exists(path) == True:
            print("\n" + "Article already exists")
        else:
            if len(re.findall('pdf', url)) != 0:
                print("\n" + 'pdf link already!')
                Hubber.pdf_hub(url, path)
            elif re.match("https://www.sci-hub.ren/", url):
                print("\n" + 'sci_hub link!')
                url = str(url).replace("https://www.sci-hub.ren/", "https://doi.org/")
                Hubber.sci_hub(path, url)
            # if the pdf can be found directly on the page
            elif re.match("https://academic.oup.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://academic.oup.com" + soup.find(class_="al-link pdf article-pdfLink").get('href')
                #print("\n"+pdf_link)
                Hubber.pdf_hub(pdf_link, path)
                '''
                doi = soup.select('div[class="ww-citation-primary"]')[0].a.get('href')
                #print("\n"+doi)
                Hubber.sci_hub(path, doi)
                '''
            elif re.match("https://content.iospress.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="btn btn-download btn-right get-pdf").get('href')
                #print("\n"+pdf_link)
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://wwwnature.53yu.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="c-pdf-download__link").get('href')
                #print("\n"+pdf_link)
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://bjo.bmj.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="article-pdf-download").get('href')
                pdf_link = "https://bjo.bmj.com" + pdf_link
                #print("\n"+pdf_link)
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://jamanetwork.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="toolbar-tool toolbar-pdf al-link pdfaccess").get('data-article-url')
                pdf_link = "https://jamanetwork.com" + pdf_link
                #print("\n"+pdf_link)
                Hubber.pdf_hub(pdf_link, path)

            # if the pdf can't be found directly, but the doi can
            elif re.match("https://sciencedirect.53yu.com/", url):
                soup = Hubber.getSoup(url)
                doi = soup.find(class_="doi").get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://diabetes.diabetesjournals.org/", url):
                soup = Hubber.getSoup(url)
                doi = soup.select('.citation-doi')[0].a.get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://journals.lww.com/", url):
                soup = Hubber.getSoup(url)
                doi = "https://doi.org/" + str(soup.find(id="ej-journal-doi").text).split("doi: ")[1]
                Hubber.sci_hub(path, doi)
            else:
                '''
                https://europepmc.org/
                https://iovs.arvojournals.org/
                https://linkspringer.53yu.com/
                '''
                print("\n"+"To be prettified! Download link may be: " + "\n" + url)
                Hubber.err_log(url)

if __name__ == '__main__':
    url = "https://www.nature.com/articles/s41598-021-87315-7.pdf"
    url1 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    url2 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    Hubber.getPDF(url, "test.pdf")
    Hubber.getPDF(url1, "test1.pdf")
    Hubber.getPDF(url2, "test2.pdf")

Published by

风君子

