Merge pull request #12 from ror-community/202407-data-update
Merge data update and bug fixes to main
adambuttrick authored Jul 19, 2024
2 parents b156846 + 2db4ae6 commit 5e50a6c
Showing 15 changed files with 46,961 additions and 37,673 deletions.
6,264 changes: 3,269 additions & 2,995 deletions data/aggregate_mapped.csv

17,083 changes: 8,460 additions & 8,623 deletions data/aggregate_unmapped.csv

45,655 changes: 22,883 additions & 22,772 deletions data/crossref_funder_work_counts.csv

2 changes: 1 addition & 1 deletion data/crossref_funders.json

Binary file modified data/crossref_overlap.png
4,347 changes: 2,229 additions & 2,118 deletions data/datacite_funder_work_counts.csv

2 changes: 1 addition & 1 deletion data/datacite_funders.json

Binary file modified data/datacite_overlap.png
3,671 changes: 2,942 additions & 729 deletions data/funders.json

7,494 changes: 7,100 additions & 394 deletions data/members.json

2 changes: 1 addition & 1 deletion data/ror_funder_registry_mapping.json

6 changes: 3 additions & 3 deletions main.py
@@ -11,9 +11,9 @@
     "DataCite - Aggregate overlap": DataCite_view
 }
 
-funder_registry_version = '1.56'
-ror_registry_version = '1.42'
-works_count_date = '2024/03/10'
+funder_registry_version = '1.58'
+ror_registry_version = '1.49'
+works_count_date = '2024/07/16'
 
 def main():
     sidebar_title = st.sidebar.title("Views")

32 changes: 14 additions & 18 deletions utilities/create_funder_id_mapping_w_dl.py
@@ -6,10 +6,6 @@
 import os
 
 
-import os
-import requests
-import zipfile
-
 def download_and_unzip(record_id, path='.'):
     # Downloading the record from Zenodo
     response = requests.get(f'https://zenodo.org/api/records/{record_id}')

@@ -30,21 +26,21 @@ def download_and_unzip(record_id, path='.'):
     return None
 
 
-
 def create_mapping_and_output_json(ror_data_file, json_output_file):
     mapping = {}
-    with open(ror_data_file, 'r+') as f_in:
-        ror_data = json.load(f_in)
-        for item in ror_data:
-            ror_id = item.get('id', '')
-            funder_id_all = item.get('external_ids', {}).get('FundRef', {}).get('all', [])
-            if funder_id_all:
-                funder_id_preferred = item.get('external_ids', {}).get('FundRef', {}).get('preferred')
-                if funder_id_preferred:
-                    funder_id_all.append(funder_id_preferred)
-                funder_id_all = list(set(funder_id_all))
-                for funder_id in funder_id_all:
-                    mapping[funder_id] = ror_id
+    with open(ror_data_file, 'r', encoding="utf8") as f_in:
+        ror_data_list = json.load(f_in)
+        for ror_data in ror_data_list:
+            ror_id = ror_data.get('id', '')
+            external_ids = ror_data.get('external_ids', [])
+            for external_id in external_ids:
+                if external_id.get('type') == 'fundref':
+                    funder_ids = external_id.get('all', [])
+                    if external_id.get('preferred'):
+                        funder_ids.append(external_id['preferred'])
+                    funder_ids = list(set(funder_ids))
+                    for funder_id in funder_ids:
+                        mapping[funder_id] = ror_id
     with open(json_output_file, 'w') as json_file:
         json.dump(mapping, json_file)

@@ -64,7 +60,7 @@ def delete_files(prefix):
     # Record ID for ROR data dumps
     record_id = input('Record ID for ROR data dump: ')
     prefix = download_and_unzip(record_id)
-    json_file_path = f'{prefix}.json'
+    json_file_path = f'{prefix}_schema_v2.json'
    if os.path.exists(json_file_path):
        outfile = 'ror_funder_registry_mapping.json'
        create_mapping_and_output_json(json_file_path, outfile)

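For context: ROR schema v2 (the `_schema_v2.json` dump the script now loads) stores external identifiers as a list of typed objects rather than v1's keyed dictionary, which is why the lookup changed from `external_ids['FundRef']` to scanning for `type == 'fundref'`. A minimal, runnable sketch of the new shape and the mapping it produces, using NSF as the sample record (field values trimmed and illustrative):

import json

# Trimmed ROR schema v2 record for illustration (NSF); real records carry many more fields
sample_record = {
    "id": "https://ror.org/021nxhr62",
    "external_ids": [
        {"type": "fundref", "all": ["100000001"], "preferred": "100000001"},
        {"type": "isni", "all": ["0000 0000 0000 0000"], "preferred": None},  # dummy value
    ],
}

mapping = {}
for external_id in sample_record["external_ids"]:
    if external_id.get("type") == "fundref":
        funder_ids = set(external_id.get("all", []))
        if external_id.get("preferred"):
            funder_ids.add(external_id["preferred"])
        for funder_id in funder_ids:
            mapping[funder_id] = sample_record["id"]

print(json.dumps(mapping))  # {"100000001": "https://ror.org/021nxhr62"}
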
26 changes: 18 additions & 8 deletions utilities/get_crossref_funder_work_counts.py
@@ -7,13 +7,23 @@
 import requests
 
 
-def catch_requests_exceptions(func):
-    def wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except requests.exceptions.RequestException:
-            return 'Error'
-    return wrapper
+def catch_request_exceptions(max_retries=3, delay=30):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except requests.exceptions.RequestException as e:
+                    retries += 1
+                    if retries == max_retries:
+                        print(f"All {max_retries} attempts failed.")
+                        return 'Error'
+                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
 
 
 def read_input_file(input_file):

@@ -29,7 +39,7 @@ def transform_funder_id(funder_id):
     return re.sub('http://dx.doi.org/10.13039/', '', funder_id)
 
 
-@catch_requests_exceptions
+@catch_request_exceptions()
 def query_crossref_api(funder_id, headers):
     base_url = "https://api.crossref.org/works"
     params = {"filter": f"funder:{funder_id}"}

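The reworked decorator is parameterized, so it has to be applied with a call (`@catch_request_exceptions()`), as above. A self-contained sketch of the retry behavior it adds, with a hypothetical always-failing function and the delay shortened so the demo runs quickly (the sketch carries its own imports; the script itself needs `time` and `functools.wraps` in scope):

import time
import requests
from functools import wraps

def catch_request_exceptions(max_retries=3, delay=30):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except requests.exceptions.RequestException:
                    retries += 1
                    if retries == max_retries:
                        print(f"All {max_retries} attempts failed.")
                        return 'Error'
                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
                    time.sleep(delay)
        return wrapper
    return decorator

@catch_request_exceptions(max_retries=2, delay=1)
def flaky_call():
    # Hypothetical stand-in for a Crossref request that always fails
    raise requests.exceptions.ConnectionError("simulated outage")

print(flaky_call())  # one retry, then "All 2 attempts failed." and 'Error'
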
50 changes: 40 additions & 10 deletions utilities/get_datacite_funder_work_counts.py
@@ -3,15 +3,28 @@
 import csv
 import requests
 import json
+import time
+from functools import wraps
 
 
+def catch_request_exception(max_retries=3, delay=30):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except requests.exceptions.RequestException as e:
+                    retries += 1
+                    if retries == max_retries:
+                        print(f"All {max_retries} attempts failed.")
+                        return 'Error'
+                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
 
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='Retrieve all work counts for funders in DataCite')
-    parser.add_argument(
-        '-i', '--input', help='Input CSV file', required=True)
-    parser.add_argument(
-        '-o', '--output', help='Output CSV file', default='datacite_funder_work_counts.csv')
-    return parser.parse_args()
 
 def read_input_file(input_file):
     funder_ids = []

@@ -21,26 +34,43 @@ def read_input_file(input_file):
             funder_ids.append(funder['id'])
     return funder_ids
 
+
 def transform_funder_id(funder_id):
-    return re.sub('http://dx.doi.org/10.13039/','*', funder_id)
+    return re.sub('http://dx.doi.org/10.13039/', '*', funder_id)
 
 
 def form_query_url(funder_id):
     print(f"https://api.datacite.org/dois?query=fundingReferences.funderIdentifier:{funder_id}")
     return f"https://api.datacite.org/dois?query=fundingReferences.funderIdentifier:{funder_id}"
 
 
+@catch_request_exception()
 def query_datacite_api(url):
     response = requests.get(url)
     response.raise_for_status()
     return response.json()
 
 
 def extract_work_count(response):
     return response['meta']['total']
 
 
 def write_output_csv(output_file, data):
     with open(output_file, 'a') as file:
         writer = csv.writer(file)
         writer.writerow(data)
 
 
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description='Retrieve all work counts for funders in DataCite')
+    parser.add_argument(
+        '-i', '--input', help='Input CSV file', required=True)
+    parser.add_argument(
+        '-o', '--output', help='Output CSV file', default='datacite_funder_work_counts.csv')
+    return parser.parse_args()
+
 
 def main():
     args = parse_arguments()
     funder_ids = read_input_file(args.input)

@@ -52,7 +82,7 @@ def main():
         response = query_datacite_api(url)
         work_count = extract_work_count(response)
         write_output_csv(args.output, [funder_id, work_count])
-
+
 
 if __name__ == "__main__":
    main()
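
The work count itself is read from the `meta.total` field of the DataCite REST API response. A trimmed, runnable illustration of the shape `extract_work_count` expects (the counts are invented):

# Trimmed, illustrative /dois response; a real one also carries the full DOI records
response = {
    "data": [],
    "meta": {"total": 1234, "totalPages": 50, "page": 1},
}

def extract_work_count(response):
    return response['meta']['total']

print(extract_work_count(response))  # 1234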
