Uzyskiwanie danych z PubMed przy użyciu Pythona

3

Wow, zaledwie tydzień temu pracowałem nad podobnym projektem!

Edycja: Niedawno zaktualizowałem kod tak, aby korzystał z BeautifulSoup. Mam dla tej biblioteki osobny virtualenv, ale można ją zainstalować za pomocą pip.

Zasadniczo mój program przyjmuje identyfikator PubMed, DOI lub plik tekstowy, którego wiersze zawierają identyfikatory PubMed i/lub DOI, i zbiera informacje o artykule. Można go łatwo dostosować do własnych potrzeb, aby uzyskać streszczenie, ale oto mój kod:

import re 
import sys 
import traceback 
from bs4 import BeautifulSoup 
import requests 

class PubMedObject(object):
    """Scrape a PubMed web page and expose the article's citation details.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup in
    the constructor; the ``get_*`` methods then read fields out of the parsed
    document, and ``render`` fills them into ``template.html``.
    """

    # Parsed page and resolved article URL; both are assigned by __init__.
    soup = None
    url = None

    # Address patterns used to reach an article page and the search page.
    ARTICLE_URL = "http://www.ncbi.nlm.nih.gov/pubmed/%s"
    SEARCH_URL = "http://www.ncbi.nlm.nih.gov/pubmed/"

    def __init__(self, pmid=None, url='', search_term=''):
        """Download and parse the page identified by one of the arguments.

        pmid is a PubMed ID
        url is the url of the PubMed web page
        search_term is the string used in the search box on the PubMed website

        When several arguments are given, ``search_term`` takes precedence
        over ``pmid``, which takes precedence over ``url``.
        """
        query = None
        if pmid:
            url = self.ARTICLE_URL % str(pmid).strip()
        if search_term:
            url = self.SEARCH_URL
            # Let requests build the query string so characters such as
            # '&', '#' or spaces in the term are percent-encoded.
            query = {"term": search_term.strip()}
        url = url.strip()
        response = requests.get(url, params=query)
        self.soup = BeautifulSoup(response.text, "html.parser")

        # set the url to be the fixed one with the PubMedID instead of the search_term
        if search_term:
            try:
                found_id = self.soup.find("dl", class_="rprtid").find("dd").text
                url = self.ARTICLE_URL % found_id
            except AttributeError:  # NoneType has no find method
                print("Error on search_term=%s" % search_term)
                # Fall back to the address that was actually fetched.
                url = response.url
        self.url = url

    @staticmethod
    def _format_author(name):
        """Turn ``"Lastname AB"`` into ``"Lastname, A.B."``.

        A name without a space (no separate initials) is returned unchanged
        instead of raising ``ValueError``.
        """
        parts = name.strip().rsplit(' ', 1)
        if len(parts) < 2:
            return parts[0]
        last_name, initials = parts
        # add periods after each letter in the first name
        return last_name + ', ' + ".".join(initials) + "."

    @staticmethod
    def _extract_doi(text):
        """Return the DOI that follows the last ``"doi:"`` marker in *text*.

        Only trailing periods are removed, so a DOI that is not followed by a
        sentence-ending period keeps its final character. Returns ``''`` when
        nothing follows the marker.
        """
        tokens = text.split("doi:")[-1].split()
        return tokens[0].rstrip(".") if tokens else ''

    def get_title(self):
        """Return the text of the ``<h1>`` inside the ``abstract`` element."""
        return self.soup.find(class_="abstract").find("h1").text

    def get_authors(self):
        """Return the authors as one ``"Lastname, A.B., Other, C."`` string.

        Author names are read from the links inside the ``auths`` element.
        """
        names = [a.text for a in self.soup.find(class_="auths").find_all("a")]
        return ', '.join(self._format_author(name) for name in names)

    def get_citation(self):
        """Return the text of the ``cit`` element."""
        return self.soup.find(class_="cit").text

    def get_external_url(self):
        """Return the best outbound link for the article.

        Prefers a ``http://dx.doi.org/`` link built from a ``doi:`` string on
        the page; if the page has no ``doi:`` string, uses the first link in
        the ``portlet`` element; otherwise falls back to ``self.url``.
        """
        doi_text = self.soup.find(text=re.compile("doi:"))
        if doi_text:
            doi = self._extract_doi(doi_text)
            if doi:
                return "http://dx.doi.org/%s" % doi
        else:
            portlet = self.soup.find(class_="portlet")
            link = portlet.find("a") if portlet else None
            # .get() avoids TypeError/KeyError when the link or href is absent.
            href = link.get('href') if link else None
            if href:
                return href

        return self.url

    def render(self):
        """Fill ``template.html`` with this article's data.

        All fields are collected before any placeholder is replaced, so a
        page that is missing one element yields a fully blank row carrying
        the ``<!-- Error -->`` marker rather than a half-filled one.

        Returns the filled template encoded as UTF-8 bytes.
        """
        with open('template.html', 'r') as template_file:
            template_text = template_file.read()

        try:
            fields = [
                ("{{ external_url }}", self.get_external_url()),
                ("{{ citation }}", self.get_citation()),
                ("{{ title }}", self.get_title()),
                ("{{ authors }}", self.get_authors()),
                ("{{ error }}", ''),
            ]
        except AttributeError:  # an expected element is missing from the page
            fields = [
                ("{{ external_url }}", ''),
                ("{{ citation }}", ''),
                ("{{ title }}", ''),
                ("{{ authors }}", ''),
                ("{{ error }}", '<!-- Error -->'),
            ]

        # NOTE(review): values are inserted verbatim, without HTML escaping.
        for placeholder, value in fields:
            template_text = template_text.replace(placeholder, value)

        return template_text.encode('utf8')

def start_table(f):
    """Write the opening markup of a resources table to *f*.

    Emits the wrapping ``<div class="resourcesTable">`` line and then the
    opening ``<table>`` line, each with its own ``write`` call.
    """
    opening_lines = (
        '\t\t\t\t\t\t\t\t\t<div class="resourcesTable">\n',
        '\t\t\t\t\t\t\t\t\t\t<table border="0" cellspacing="0" cellpadding="0">\n',
    )
    for markup in opening_lines:
        f.write(markup)

def end_table(f):
    """Write the closing markup of a resources table to *f*.

    Emits the ``</table>`` line and then the ``</div>`` line that closes the
    wrapper, each with its own ``write`` call.
    """
    closing_lines = (
        '\t\t\t\t\t\t\t\t\t\t</table>\n',
        '\t\t\t\t\t\t\t\t\t</div>\n',
    )
    for markup in closing_lines:
        f.write(markup)

def start_accordion(f):
    """Write the opening ``<div class="accordion">`` line to *f*."""
    opening_div = '\t\t\t\t\t\t\t\t\t<div class="accordion">\n'
    f.write(opening_div)

def end_accordion(f):
    """Write the ``</div>`` line that closes an accordion section to *f*."""
    closing_div = '\t\t\t\t\t\t\t\t\t</div>\n'
    f.write(closing_div)

def _render_entry(entry):
    """Return the rendered HTML row (as text) for one non-heading input line.

    *entry* is an already stripped line: a ``http://dx.doi.org/`` link, any
    other URL, a numeric PubMed ID, or a free-text search term.
    """
    doi_prefix = "http://dx.doi.org/"
    is_plain_url = entry.startswith("http") and not entry.startswith(doi_prefix)

    if entry.startswith(doi_prefix):
        article = PubMedObject(search_term=entry[len(doi_prefix):])
    elif is_plain_url:
        print("url=%s" % entry)
        article = PubMedObject(url=entry)
    elif entry.isdigit():
        article = PubMedObject(entry)
    else:
        article = PubMedObject(search_term=entry)

    html = article.render()
    # render() returns UTF-8 encoded bytes; the output file is opened in
    # text mode, so decode before writing.
    if isinstance(html, bytes):
        html = html.decode('utf8')
    if is_plain_url:
        print(html)
    return html


def main(args):
    """Convert ``pmids.txt`` into ``result.html``.

    Each non-blank line of ``pmids.txt`` is either a heading (``###`` for a
    section, ``####`` for an accordion sub-section) or an article reference
    (PubMed ID, DOI link, other URL, or search term). The raw input is first
    echoed into an HTML comment, then headings and article rows are written,
    with tables and accordions opened and closed as needed.

    *args* is accepted for the command-line call but is not used.

    Returns 0 on success and 1 if an exception was raised.
    """
    try:
        print("Parsing pmids.txt...")
        with open('pmids.txt', 'r') as pmid_file:
            lines = pmid_file.readlines()

        with open('result.html', 'w', encoding='utf-8') as sum_file:
            # Keep a copy of the input at the top of the output for reference.
            sum_file.write('<!--\n')
            sum_file.writelines(lines)
            sum_file.write('\n-->\n')

            h3 = False
            h4 = False
            table_mode = False
            accordion_mode = False
            for line in lines:
                # Strip first so the trailing newline does not defeat
                # isdigit() or leak into URLs and search terms.
                entry = line.strip()
                if not entry:
                    continue
                if entry.startswith("####"):
                    if h3 and not accordion_mode:
                        start_accordion(sum_file)
                        accordion_mode = True
                    sum_file.write('\t\t\t\t\t\t\t\t\t<h4><a href="#">%s</a></h4>\n' % entry[4:].strip())
                    h4 = True
                elif entry.startswith("###"):
                    # Close only what was actually opened, so no stray
                    # </table> or </div> is emitted.
                    if table_mode:
                        end_table(sum_file)
                        table_mode = False
                    if accordion_mode:
                        end_accordion(sum_file)
                        accordion_mode = False
                    h4 = False
                    sum_file.write('\t\t\t\t\t\t\t\t<h3><a href="#">%s</a></h3>\n' % entry[3:].strip())
                    h3 = True
                else:
                    if (h3 or h4) and not table_mode:
                        start_table(sum_file)
                        table_mode = True
                    sum_file.write(_render_entry(entry))

            if table_mode:
                end_table(sum_file)
            if accordion_mode:
                end_accordion(sum_file)
        print("Done!")

    except Exception as e:
        # Exception (not BaseException) so Ctrl-C and sys.exit() still
        # propagate instead of being reported as an error.
        print(traceback.format_exc())
        print("Error: %s %s" % (sys.exc_info()[0], e.args))
        return 1  # exit on error
    else:
        input("Press enter to exit.")
        return 0  # exit errorlessly

# Run main() only when this file is executed as a script, and use its return
# value (0 on success, 1 on error) as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))

Teraz program zwraca plik HTML utworzony na podstawie pobranych informacji. Oto szablon (kod wczytuje go z pliku template.html):

<tr>{{ error }} 
    <td valign="top" class="resourcesICO"><a href="{{ external_url }}" target="_blank"><img src="/image/ico_sitelink.gif" width="24" height="24" /></a></td> 
    <td><a href="{{ external_url }}">{{ title }}</a><br /> 
    {{ authors }}<br /> 
    <em>{{ citation }}</em></td> 
</tr>

Po uruchomieniu program poprosi o podanie DOI lub identyfikatora PubMed. Jeśli go nie podasz, program odczyta plik pmids.txt. (Uwaga: pokazana wyżej wersja kodu od razu odczytuje pmids.txt.) Możesz swobodnie korzystać z kodu według własnego uznania.

Źródło

2013-07-01 16:34:03 Bobort

+0

Dziękuję, Bobort. Zamierzam zmodyfikować ten kod tak, aby pobierał tylko informacje ze streszczenia. Zintegruję go też z innym skryptem, który odwzorowuje identyfikator PubMed na tytuł strukturalny i tytuł cytowania. –

+0

Dlaczego dostałem głos negatywny? Jakże mało pomocne jest zagłosować przeciw odpowiedzi i po prostu odejść! – Bobort

+0

Cześć Bobort, myślę, że to ktoś inny zagłosował przeciw tej odpowiedzi. Naprawię to dla ciebie. –

1

Adresy artykułów PubMed mają postać: http://www.ncbi.nlm.nih.gov/pubmed/?Id

Jeśli znasz identyfikator, możesz pobrać stronę spod powyższego adresu i uzyskać dostęp do artykułu. Streszczenie jest zawarte w strukturze takiej jak:

<div class="abstr"><h3>Abstract</h3><div class=""><p>α-latrotoxin and snake presynaptic phospholipases A2 neurotoxins target the presynaptic membrane of axon terminals of the neuromuscular junction....</p></div></div>

Będziesz wtedy potrzebować narzędzia do wyodrębnienia tego fragmentu. Sugerowałbym użycie: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

Nadal będziesz potrzebować narzędzia, które faktycznie pobierze HTML. Do tego użyłbym phantom.js lub popularnego modułu requests.

Twój przepływ pracy wyglądałby mniej więcej tak:

# Sketch: collect the abstract element of each PubMed article page.
# Assumes `requests` and `BeautifulSoup` (from bs4) are already imported.
pubmed_ids = [1, 2, 3]
abstracts = []

for pubmed_id in pubmed_ids:
    response = requests.get('http://www.ncbi.nlm.nih.gov/pubmed/{0}'.format(pubmed_id))
    # BeautifulSoup needs the page markup, not the Response object itself.
    soup = BeautifulSoup(response.text, 'html.parser')
    # The abstract sits in <div class="abstr">; find() yields None if absent.
    abstract = soup.find('div', class_='abstr')
    abstracts.append(abstract)

Źródło

2013-07-01 16:30:38 bearrito

19

Za pomocą modułu Entrez z pakietu Biopython można łatwo uzyskać streszczenie wraz ze wszystkimi innymi metadanymi. Poniższy kod wypisuje streszczenie:

from Bio.Entrez import efetch 

def print_abstract(pmid):
    """Fetch the plain-text abstract record for *pmid* and print it.

    The record is requested from the ``pubmed`` database through ``efetch``
    with ``retmode='text'`` and ``rettype='abstract'``.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    try:
        print(handle.read())
    finally:
        # Release the connection even if reading or printing fails.
        handle.close()

A oto funkcja, która pobiera XML i zwraca tylko streszczenie:

from Bio.Entrez import efetch, read 

def fetch_abstract(pmid):
    """Return the first abstract text element for *pmid*, or ``None``.

    The record is fetched as XML from the ``pubmed`` database and parsed with
    ``read``. ``None`` is returned when no record comes back or when the
    record has no abstract.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    try:
        records = read(handle)
    finally:
        # Release the connection even if parsing fails.
        handle.close()

    # NOTE(review): presumably newer Biopython releases return a mapping with
    # the articles under 'PubmedArticle' instead of a plain list -- confirm
    # against the installed version. A plain list passes through untouched.
    if isinstance(records, dict):
        records = records.get('PubmedArticle', [])

    try:
        article = records[0]['MedlineCitation']['Article']
        return article['Abstract']['AbstractText'][0]
    except (IndexError, KeyError):
        # IndexError: no record or empty abstract list.
        # KeyError: the record has no 'Abstract' section at all.
        return None

PS: Rzeczywiście potrzebowałem czegoś takiego w prawdziwym zadaniu, więc zorganizowałem kod w klasę – zobacz ten gist.

Źródło

2013-11-22 16:39:54 Karol

+1

To wygląda na bardzo ładny moduł. Nie miałem pojęcia, że to istnieje. Jednak jedną dobrą rzeczą w moim kodzie jest to, że uzyskuje wartości DOI, dzięki czemu uzyskany adres URL jest jak najbardziej ogólny. Zakładam, że takie funkcje mogą istnieć w module Entrez, ale nie zagłębiłem się w to. – Bobort

+0

Nie jestem pewien, co masz na myśli przez adres URL ... biopython wykonuje wszystkie zapytania za kulisami, więc nie musisz grać z żadnymi adresami URL. – Karol

+0

Jest w porządku. Moja aplikacja tworzy "http://dx.doi.org/", dzięki czemu mogę go używać na stronie internetowej. Zamiast iść do rzeczy PubMed, chcę przejść bezpośrednio do artykułu. Najbardziej ogólnym sposobem, jaki znam teraz, który jest przyjazny programistom, jest użycie schematu DOI. – Bobort

0

Wygląda na to, że moduł „pattern” potrafi to zrobić w prosty sposób:

from pattern import web 
 
import requests 
 

 
# PubMed ID of the article to fetch (named pmid so the builtin id() stays usable).
pmid = 27523945

url = "http://www.ncbi.nlm.nih.gov/pubmed/{0}".format(pmid)

# Non-ASCII characters are dropped here before the page is parsed.
# NOTE(review): presumably needed by pattern's parser -- confirm, since it
# also removes characters such as Greek letters from the abstract.
page = requests.get(url).text.encode('ascii', 'ignore')

dom = web.Element(page)

# by_tag() may find nothing; avoid an IndexError on the [0] lookup.
abstract_nodes = dom.by_tag("abstracttext")
if abstract_nodes:
    print(abstract_nodes[0].content)
else:
    print("No abstract found for PubMed ID {0}".format(pmid))

Źródło

2016-08-17 14:33:35

Uzyskiwanie danych z PubMed przy użyciu Pythona

Odpowiedz

Powiązane problemy