Merge pull request #12 from ror-community/202407-data-update
Merge data update and bug fixes to main
adambuttrick authored Jul 19, 2024
2 parents b156846 + 2db4ae6 commit 5e50a6c
Showing 15 changed files with 46,961 additions and 37,673 deletions.
6,264 changes: 3,269 additions & 2,995 deletions data/aggregate_mapped.csv

17,083 changes: 8,460 additions & 8,623 deletions data/aggregate_unmapped.csv

45,655 changes: 22,883 additions & 22,772 deletions data/crossref_funder_work_counts.csv

2 changes: 1 addition & 1 deletion data/crossref_funders.json

Binary file modified data/crossref_overlap.png
4,347 changes: 2,229 additions & 2,118 deletions data/datacite_funder_work_counts.csv

2 changes: 1 addition & 1 deletion data/datacite_funders.json

Binary file modified data/datacite_overlap.png
3,671 changes: 2,942 additions & 729 deletions data/funders.json

7,494 changes: 7,100 additions & 394 deletions data/members.json

2 changes: 1 addition & 1 deletion data/ror_funder_registry_mapping.json

6 changes: 3 additions & 3 deletions main.py
@@ -11,9 +11,9 @@
     "DataCite - Aggregate overlap": DataCite_view
 }
 
-funder_registry_version = '1.56'
-ror_registry_version = '1.42'
-works_count_date = '2024/03/10'
+funder_registry_version = '1.58'
+ror_registry_version = '1.49'
+works_count_date = '2024/07/16'
 
 def main():
     sidebar_title = st.sidebar.title("Views")

32 changes: 14 additions & 18 deletions utilities/create_funder_id_mapping_w_dl.py
@@ -6,10 +6,6 @@
 import os
 
 
-import os
-import requests
-import zipfile
-
 def download_and_unzip(record_id, path='.'):
     # Downloading the record from Zenodo
     response = requests.get(f'https://zenodo.org/api/records/{record_id}')

@@ -30,21 +26,21 @@ def download_and_unzip(record_id, path='.'):
     return None
 
 
-
 def create_mapping_and_output_json(ror_data_file, json_output_file):
     mapping = {}
-    with open(ror_data_file, 'r+') as f_in:
-        ror_data = json.load(f_in)
-        for item in ror_data:
-            ror_id = item.get('id', '')
-            funder_id_all = item.get('external_ids', {}).get('FundRef', {}).get('all', [])
-            if funder_id_all:
-                funder_id_preferred = item.get('external_ids', {}).get('FundRef', {}).get('preferred')
-                if funder_id_preferred:
-                    funder_id_all.append(funder_id_preferred)
-                funder_id_all = list(set(funder_id_all))
-                for funder_id in funder_id_all:
-                    mapping[funder_id] = ror_id
+    with open(ror_data_file, 'r', encoding="utf8") as f_in:
+        ror_data_list = json.load(f_in)
+        for ror_data in ror_data_list:
+            ror_id = ror_data.get('id', '')
+            external_ids = ror_data.get('external_ids', [])
+            for external_id in external_ids:
+                if external_id.get('type') == 'fundref':
+                    funder_ids = external_id.get('all', [])
+                    if external_id.get('preferred'):
+                        funder_ids.append(external_id['preferred'])
+                    funder_ids = list(set(funder_ids))
+                    for funder_id in funder_ids:
+                        mapping[funder_id] = ror_id
     with open(json_output_file, 'w') as json_file:
         json.dump(mapping, json_file)

@@ -64,7 +60,7 @@ def delete_files(prefix):
     # Record ID for ROR data dumps
     record_id = input('Record ID for ROR data dump: ')
     prefix = download_and_unzip(record_id)
-    json_file_path = f'{prefix}.json'
+    json_file_path = f'{prefix}_schema_v2.json'
    if os.path.exists(json_file_path):
        outfile = 'ror_funder_registry_mapping.json'
        create_mapping_and_output_json(json_file_path, outfile)

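For context: ROR schema v2 (the `_schema_v2.json` dump the script now loads) stores external identifiers as a list of typed objects rather than v1's keyed dictionary, which is why the lookup changed from `external_ids['FundRef']` to scanning for `type == 'fundref'`. A minimal, runnable sketch of the new shape and the mapping it produces, using NSF as the sample record (field values trimmed and illustrative):

import json

# Trimmed ROR schema v2 record for illustration (NSF); real records carry many more fields
sample_record = {
    "id": "https://ror.org/021nxhr62",
    "external_ids": [
        {"type": "fundref", "all": ["100000001"], "preferred": "100000001"},
        {"type": "isni", "all": ["0000 0000 0000 0000"], "preferred": None},  # dummy value
    ],
}

mapping = {}
for external_id in sample_record["external_ids"]:
    if external_id.get("type") == "fundref":
        funder_ids = set(external_id.get("all", []))
        if external_id.get("preferred"):
            funder_ids.add(external_id["preferred"])
        for funder_id in funder_ids:
            mapping[funder_id] = sample_record["id"]

print(json.dumps(mapping))  # {"100000001": "https://ror.org/021nxhr62"}
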
26 changes: 18 additions & 8 deletions utilities/get_crossref_funder_work_counts.py
@@ -7,13 +7,23 @@
 import requests
 
 
-def catch_requests_exceptions(func):
-    def wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except requests.exceptions.RequestException:
-            return 'Error'
-    return wrapper
+def catch_request_exceptions(max_retries=3, delay=30):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except requests.exceptions.RequestException as e:
+                    retries += 1
+                    if retries == max_retries:
+                        print(f"All {max_retries} attempts failed.")
+                        return 'Error'
+                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
 
 
 def read_input_file(input_file):

@@ -29,7 +39,7 @@ def transform_funder_id(funder_id):
     return re.sub('http://dx.doi.org/10.13039/', '', funder_id)
 
 
-@catch_requests_exceptions
+@catch_request_exceptions()
 def query_crossref_api(funder_id, headers):
     base_url = "https://api.crossref.org/works"
     params = {"filter": f"funder:{funder_id}"}

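The reworked decorator is parameterized, so it has to be applied with a call (`@catch_request_exceptions()`), as above. A self-contained sketch of the retry behavior it adds, with a hypothetical always-failing function and the delay shortened so the demo runs quickly (the sketch carries its own imports; the script itself needs `time` and `functools.wraps` in scope):

import time
import requests
from functools import wraps

def catch_request_exceptions(max_retries=3, delay=30):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except requests.exceptions.RequestException:
                    retries += 1
                    if retries == max_retries:
                        print(f"All {max_retries} attempts failed.")
                        return 'Error'
                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
                    time.sleep(delay)
        return wrapper
    return decorator

@catch_request_exceptions(max_retries=2, delay=1)
def flaky_call():
    # Hypothetical stand-in for a Crossref request that always fails
    raise requests.exceptions.ConnectionError("simulated outage")

print(flaky_call())  # one retry, then "All 2 attempts failed." and 'Error'
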
50 changes: 40 additions & 10 deletions utilities/get_datacite_funder_work_counts.py
@@ -3,15 +3,28 @@
 import csv
 import requests
 import json
+import time
+from functools import wraps
 
 
+def catch_request_exception(max_retries=3, delay=30):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except requests.exceptions.RequestException as e:
+                    retries += 1
+                    if retries == max_retries:
+                        print(f"All {max_retries} attempts failed.")
+                        return 'Error'
+                    print(f"Request failed. Retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
 
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='Retrieve all work counts for funders in DataCite')
-    parser.add_argument(
-        '-i', '--input', help='Input CSV file', required=True)
-    parser.add_argument(
-        '-o', '--output', help='Output CSV file', default='datacite_funder_work_counts.csv')
-    return parser.parse_args()
 
 def read_input_file(input_file):
     funder_ids = []

@@ -21,26 +34,43 @@ def read_input_file(input_file):
             funder_ids.append(funder['id'])
     return funder_ids
 
+
 def transform_funder_id(funder_id):
-    return re.sub('http://dx.doi.org/10.13039/','*', funder_id)
+    return re.sub('http://dx.doi.org/10.13039/', '*', funder_id)
 
 
 def form_query_url(funder_id):
     print(f"https://api.datacite.org/dois?query=fundingReferences.funderIdentifier:{funder_id}")
     return f"https://api.datacite.org/dois?query=fundingReferences.funderIdentifier:{funder_id}"
 
 
+@catch_request_exception()
 def query_datacite_api(url):
     response = requests.get(url)
     response.raise_for_status()
     return response.json()
 
 
 def extract_work_count(response):
     return response['meta']['total']
 
 
 def write_output_csv(output_file, data):
     with open(output_file, 'a') as file:
         writer = csv.writer(file)
         writer.writerow(data)
 
 
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description='Retrieve all work counts for funders in DataCite')
+    parser.add_argument(
+        '-i', '--input', help='Input CSV file', required=True)
+    parser.add_argument(
+        '-o', '--output', help='Output CSV file', default='datacite_funder_work_counts.csv')
+    return parser.parse_args()
+
 
 def main():
     args = parse_arguments()
     funder_ids = read_input_file(args.input)

@@ -52,7 +82,7 @@ def main():
         response = query_datacite_api(url)
         work_count = extract_work_count(response)
         write_output_csv(args.output, [funder_id, work_count])
-
+
 
 if __name__ == "__main__":
    main()
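
The work count itself is read from the `meta.total` field of the DataCite REST API response. A trimmed, runnable illustration of the shape `extract_work_count` expects (the counts are invented):

# Trimmed, illustrative /dois response; a real one also carries the full DOI records
response = {
    "data": [],
    "meta": {"total": 1234, "totalPages": 50, "page": 1},
}

def extract_work_count(response):
    return response['meta']['total']

print(extract_work_count(response))  # 1234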
