"""
Enrich Cordis with OpenAIRE
===========================
Tools for collecting OpenAIRE data (by Cordis project), and piping
to Neo4j.
"""
from bs4 import BeautifulSoup
import logging
import time
from py2neo import Node
[docs]def write_record_to_neo(record, output_type, graph):
'''A utility function, which takes record and writes it to neo4j graph
Args:
record (dict): a dictionary that contains metadata about a record
output_type(str): type of record to be extracted from OpenAIRE API. Accepts "software", "datasets", "publications", "ECProjects"
graph(graph_session): connection to neo4j database
'''
record_type = str(output_type).capitalize()
found_node = graph.nodes.match(output_type, pid=record['pid']).first()
if found_node is None:
created_node = Node(record_type, title=record['title'], pid=record['pid'])
graph.create(created_node)
return created_node
else:
logging.info("returning found node")
return found_node
[docs]def get_project_soups(currentUrl, reqsession, output_type, projectID):
''' Gets a beautiful soup according to output type and projectID
Args:
currentUrl(str): URL to OpenAIRE API
reqsession (instance of Requests session): currently open HTTP request
output_type(str): type of record to be extracted from OpenAIRE API. Accepts "software", "datasets", "publications", "ECProjects"
projectID(str): EC project identifier
Returns:
souplist(list): a list of BeautifulSoup objects that contain the results from API call
'''
response_size = -1
total = 9999
page = 1
pagesize = 100
souplist = list()
while (page + 1)*pagesize < total:
response = reqsession.get(currentUrl + output_type, params={'hasProject': 'true', 'projectID': projectID, 'size': pagesize, 'page': page})
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'lxml')
#logging.info("soup retrieved for page %d" % page)
souplist.append(soup)
response.close()
if page == 1:
total = int(soup.find("total").text)
#logging.info("total: %d" % total)
page += 1
else:
logging.info(response.status_code)
logging.info("Service unavailable, waiting 10 seconds and trying again")
time.sleep(10)
continue
return souplist
[docs]def get_results_from_soups(souplist):
''' Extracts string from all BeautifulSoup objects and merges them into one list
Args:
souplist(list): a list of BeautifulSoup objects that contain the results from API call
Returns:
resultlist(list): a list of strings with results metadata
'''
resultlist = list()
for soup in souplist:
soupresults = soup.find_all("oaf:result")
resultlist = resultlist + soupresults
return resultlist
'''
def get_soup_contents(currentUrl, reqsession, output_type, resumption_token):
for x in range(0, 9):
#requests.get(url, params={'metadataPrefix':'oaf', 'set':output_type})
#pdb.set_trace()
if resumption_token == 'None' or resumption_token == 'First request':
response = reqsession.get(currentUrl, params={'verb': 'ListRecords', 'metadataPrefix': 'oaf', 'set': output_type})
logging.info("No resumptionToken")
else:
logging.info("resumptionToken is there ")
#response = reqsession.get(currentUrl, params={'verb': 'ListRecords', 'metadataPrefix': 'oaf', 'set': output_type, 'resumptionToken': resumption_token})
#resumptionToken as a parameter does not work for requests call, hence use string request
requeststr = currentUrl + '?verb=ListRecords&resumptionToken=' + resumption_token
logging.info(requeststr)
response = reqsession.get(requeststr)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'lxml')
response.close()
return soup
else:
logging.info(response.status_code)
logging.info("Service unavailable, waiting 10 seconds and trying again")
time.sleep(10)
continue
def add_linkages_to_neo(record, output_type, graph):
cypherquery = " MATCH (b:Project),(a:Dataset) WHERE a.id = %s AND ID(b) = %s CREATE (b)-[r:hasDataset]->(a) RETURN r" % (project_codes, record['id'])
graph.run(cypherquery)
def find_project_in_db(in_project_code, db_session):
records = db_session \
.query(openaire_orm.ECProjectRecord) \
.filter_by(project_code=in_project_code)
#pdb.set_trace()
try:
return records[0]
except IndexError:
return None
def write_records_to_db(records, output_type, db_session):
if output_type == "software":
is_software = True
else:
is_software = False
#iterate through records
for record in records:
#create object
record_obj = get_record_object(record, output_type)
#if software, find related EC projects and create relationship with related ECprojects via association table
if is_software:
record_obj = link_record_with_project(record, record_obj, db_session)
#add object into database
local_object = db_session.merge(record_obj)
db_session.add(local_object)
db_session.commit()
def get_record_object(cur_record, output_type):
if output_type == 'software':
return openaire_orm.SoftwareRecord(title=cur_record['title'], pid=cur_record['pid'], creators=str(cur_record['creators']) )
if output_type == 'ECProjects':
return openaire_orm.ECProjectRecord(title=cur_record['title'], project_code=cur_record['project_code'])
def parse_soft (cur_soup):
output_list = list()
results = cur_soup.find_all(re.compile("^oaf:result"))
return [{'project_codes': r.find('code'),
'pid': r.find('pid').text,
'title': r.find('title').text,
'creators': r.find_all('creators'),}
for r in results
if r.find_all('code')] #if code tag exists, then it is related to EC project
def parse_proj (cur_soup):
output_list = list()
results = cur_soup.find_all(re.compile("^oaf:project"))
return [{'title': r.find('title').text,
'project_code': r.find('code').text}
for r in results]
def parse_datasets (cur_soup):
output_list = list()
results = cur_soup.find_all(re.compile("^oaf:result"))
return [{'project_codes': r.find('code').text,
'title': r.find('title').text,
'pid': r.find('pid').text,}
for r in results
if r.find_all('code')] #if code tag exists, then it is related to EC project
def get_res_token(soup):
with open('current_soup.txt', 'w', encoding="utf-8") as f:
f.write(str(soup))
res_token = soup.find(re.compile("^oai:resumptiontoken"))
if res_token:
res_token_str = res_token.text
res_token_str = res_token_str.replace(' ', '%20')
res_token_str = res_token_str.replace('"', '%22')
return res_token_str
else:
return 'None'
'''