diff --git a/README.md b/README.md
index 86d530e..ec1e443 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,98 @@
 # database_migration
 Python scripts for migrating a MySQL database to a PostgreSQL database with a different structure
+
+# Database migration scripts for Zacharias Topelius Skrifter, topelius.sls.fi
+
+As a general rule, the scripts only migrate data concerning collections that are or will be a part of the published edition.
+
+Data belonging to previously planned but abandoned collections is not migrated.
+
+The scripts should be run in the following order:
+
+## 1. migrate_main_tables.py
+Enter the project name as the value of the variable PROJECT_NAME before running the script.
+
+The script creates two json-files which map id:s from the old db to id:s created in the new db, in the id_dictionaries folder.
+
+## 2. migrate_manuscripts_and_versions.py
+This script uses publication_ids.json which was created by script 1.
+
+The script creates two json-files which map id:s from the old db to id:s created in the new db, in the id_dictionaries folder.
+
+## 3. migrate_facsimiles.py
+This script uses publication_ids.json which was created by script 1 and manuscript_ids.json which was created by script 2.
+
+The script creates a json-file which maps id:s from the old db to id:s created in the new db, in the id_dictionaries folder.
+
+It creates a log file with info about facsimiles which do not belong to migrated publications. In some cases, these facsimiles appear elsewhere on the website and might need special attention.
+
+## 4. create_comment_data.py
+This script uses collection_ids.json which was created by script 1.
+
+Old collection id:s and relative paths to their comment folders are given as a list of tuples to the variable old_collections.
+
+It creates a log file containing the publications and the comment file paths which matched them. It also creates a log file containing publications for which no comment file path was found.
+
+## 5. update_publication_with_filepaths.py
+This script uses collection_ids.json and publication_ids.json which were created by script 1.
+
+It imports four functions from script 4, create_comment_data.py.
+
+Old collection id:s and relative paths to their reading text folders are given as a list of tuples to the variable old_collections.
+
+Publication names and their matching reading text file names for collections Publicistik, Forelasningar and Lasning for barn are stored in three separate documents: ZTS_Publicistik_verk_signum_filer.csv, Forelasningar_signum_filer.csv and Lfb_signum_filer.csv.
+
+It creates a log file containing the publications and the reading text file paths which matched them. It also creates a log file containing publications for which no reading text file path was found.
+
+## 6. update_manuscript_with_filepaths.py
+This script uses collection_ids.json which was created by script 1.
+
+Old collection id:s and relative paths to their manuscript folders are given as a list of tuples to the variable old_collections.
+
+It creates a log file containing the publications and the manuscript file paths which matched them and a log file containing publications for which no manuscript file path was found. It also creates a log file containing manuscript files with the same title; these file paths need to be inserted into the database manually.
+
+## 7. update_version_with_filepaths.py
+This script uses collection_ids.json which was created by script 1.
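+
+For reference: the id dictionaries in the id_dictionaries folder are plain JSON objects mapping id:s from the old db (string keys) to the id:s created in the new db. A minimal sketch of how the scripts read such a file (the id values below are made up):
+
+```python
+import json
+
+# load the mapping of old collection id:s to new collection id:s
+with open("id_dictionaries/collection_ids.json", encoding="utf-8") as source_file:
+    collection_id_dict = json.load(source_file)
+
+# e.g. {"1": 17, "2": 18}; the keys are strings, so look up with str(old_id)
+old_id = 1
+new_collection_id = collection_id_dict[str(old_id)]
+```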
+
+Old collection id:s and relative paths to their version folders are given as a list of tuples to the variable old_collections.
+
+The script uses copies of all the version XML files from the web server; they need to be stored in a subfolder named var.
+
+It creates five log files: one containing the publications and the version file paths which matched them; one containing versions for which no file path was found; one containing publications and version file paths from the matching directory/directories; one containing publication versions for which no matching directory was found; one containing version file paths which were matched several times (they need to be checked manually).
+
+## 8. create_toc.py
+Enter the project id as the value of the variable PROJECT_ID before running the script.
+
+This script uses collection_ids.json which was created by script 1 and Lfb_split.csv (for Läsning för barn).
+
+The script fetches info from table tableofcontents in the old db and transforms it into one properly ordered toc json file for each new collection. It sorts the toc items based on different values in the db.
+
+## 9. split_lasning_for_barn.py
+This script uses the XML files for Lasning for barn and the list Lfb_split.csv, which contains publication info.
+
+The script splits the 8 large XML files so that each story/publication is in a separate file.
+
+The script also creates a csv file (Lfb_signum_filer.csv) mapping legacy id:s to the newly created file paths. This file is used by update_publication_with_filepaths.py.
+
+## 10. split_lasning_for_barn_comments.py
+This script uses Lfb_split.csv, which contains publication info, and Lfb_kommentarer.xml, which contains all the general comments for Lasning for barn.
+
+The script creates an XML file for each comment, adds the right content and saves the file path. It then inserts comment info into the database, connecting it to the right publication.
+
+The script also creates a csv file (Lfb_kommentarer_filer.csv) mapping legacy id:s to the newly created file paths.
+
+## 11. facsimile_url_info.py
+This script uses publicistiktabell.csv to create a list of facsimile legacy id:s and url:s.
+
+It fetches metadata based on the url using an API and inserts it into table publication_facsimile_collection. The script also inserts id:s into table publication_facsimile.
+
+## 12. create_introduction_and_title.py
+This script uses collection_ids.json, which was created by script 1, and introduction_title_names.csv, which contains the name bases for each title page and introduction XML file. It also needs the current project id (PROJECT_ID).
+
+It inserts data for introductions and title pages and updates table publication_collection with the corresponding ids.
+
+## 13. update_notes.py
+This script updates the document_id:s in table documentnote in db topelius_notes for collection Läsning för barn. It has to be run only once; document_id:s are not continuously updated. The XML files need to be accessible to the script.
+
+## update_comment_with_lasning_for_barn.py
+This script was used for updating table document in db topelius_notes with filepaths to comments for Läsning för barn.
\ No newline at end of file
diff --git a/create_comment_data.py b/create_comment_data.py
new file mode 100644
index 0000000..23c9f0a
--- /dev/null
+++ b/create_comment_data.py
@@ -0,0 +1,150 @@
+"""Script that inserts data into table publication_comment and updates table publication with the corresponding publication_comment_id.
+It finds out the original filename for the comment file and inserts it into publication_comment (this info did not exist in the old database). +Created by Anna Movall and Jonas Lillqvist in February 2020""" + +import psycopg2 +import json +from pathlib import Path +import re +from fuzzywuzzy import fuzz + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +# get relevant info from publication table using select with collection id +def get_info_from_publication(new_collection_id): + publication_info = [] + fetch_query = """SELECT id, name, published, legacy_id FROM publication WHERE publication_collection_id = %s""" + cursor_new.execute(fetch_query, (new_collection_id,)) + publication_info = cursor_new.fetchall() + return publication_info + +# create path object for folder from given filepath string, save all paths to files found in this folder or subfolders in a list +def create_file_list(filepath): + path = Path(filepath) + filelist = [] + iterate_through_folders(path, filelist) + return filelist + +# iterate through folders recursively and append filepaths to list +def iterate_through_folders(path, filelist): + for content in path.iterdir(): + if content.is_dir(): + iterate_through_folders(content, filelist) + else: + filelist.append(content) + +# compare publication name with the collection's file names to find file path to general comment (or reading text, when called upon from insert_filepaths.py) +def compare_pubnames_with_filenames(publication_name, filepath_list, match_count, publication_count): + # remove special characters from publication names + search_str = re.sub(r",|\.|\?|!|–|’|»|:|(|)|\[|\]|&", "", publication_name).strip() + search_str = search_str.replace(" ", "_").lower() + search_str = search_str.replace("-", "_") + search_str = search_str.replace("ä", "a") + search_str = search_str.replace("å", "a") + search_str = search_str.replace("ö", "o") + search_str = search_str.replace("é", "e") + search_str = search_str.replace("ü", "u") + search_str = search_str.replace("æ", "ae") + found = False + i = 0 + # for filepath in filepath_list: + while found == False and i < len(filepath_list): + original_path = filepath_list[i] + # get filename without suffix + filepath = filepath_list[i].stem + # remove special characters and useless stuff from filename + filepath = filepath.replace("K ", "") + filepath = filepath.replace(" tg", "") + filepath = filepath.replace(" ", "_").lower() + filepath = filepath.replace("-", "_") + filepath = filepath.replace("ä", "a") + filepath = filepath.replace("å", "a") + filepath = filepath.replace("ö", "o") + filepath = filepath.replace("é", "e") + filepath = filepath.replace("æ", "ae") + filepath = filepath.replace("æ", "ae") + filepath = re.sub(r"_komm$|\[|\]", "", filepath) + filepath = filepath.replace("_Academica", "") + filepath = filepath.replace("brev_komm_", "") # for letters + # compare publication name with file name: + if fuzz.partial_ratio(search_str, filepath) == 100: + found = True + match_count += 1 + break + i += 1 + if not found: + original_path = None + publication_count += 1 + return original_path, match_count, publication_count + + +def main(): + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + # list of all collections with collection id and path to folder with 
general comments; collections without general comments use a template file: + old_collections = [(1, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Ljungblommor"), (2, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Nya_blad_och_Ljung"), (4, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Noveller"), (5, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Hertiginnan_af_Finland_och_andra_historiska_noveller"), (7, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Vinterqvallar"), (12, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Finland_framstalldt_i_teckningar"), (16, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Ovrig_lyrik"), (18, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Noveller_och_kortprosa"), (24, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Academica"), (30, "../../Topelius SVN/documents/Redaktionella_texter/Kommentarer/Brev/Forlagskorrespondens"), (6, "templates/comment.xml"), (8, "templates/comment.xml"), (10, "templates/comment.xml"), (13, "templates/comment.xml"), (20, "templates/comment.xml"), (22, "templates/comment.xml"), (23, "templates/comment.xml"), (29, "templates/comment.xml"), (31, "templates/comment.xml")] + template_path = "templates/comment.xml" + # initialize counters for match log statistics + publication_count = 0 + match_count = 0 + # create log files + log_found = open("logs/matched_comments.txt", "w", encoding="utf-8") + log_not_found = open("logs/unmatched_comments.txt", "w", encoding="utf-8") + # loop through collections and publications in them + for collection in old_collections: + old_id = collection[0] + collection_path = collection[1] + new_collection_id = collection_id_dict[str(old_id)] # get new collection id using dictionary + publication_info = get_info_from_publication(new_collection_id) # select publications with this collection id from table publication + # get all file paths from collection's folder, if there is one + if collection_path != template_path: + filepath_list = create_file_list(collection_path) + # get info about one publication, match name with file path if needed, create a row in publication_comment and update publication with comment id + for tuple in publication_info: + publication_name = tuple[1] + # check if collection has a general comment; if yes, get the comment's filepath through the comparison function + if collection_path != template_path: + comment_filepath, match_count, publication_count = compare_pubnames_with_filenames(publication_name, filepath_list, match_count, publication_count) + # if the publication has a matching file path, write match to log file and store file path in shortened form in a variable + if comment_filepath is not None: + log_found.write("PUBLICATION: " + publication_name + " MATCHED " + comment_filepath.as_posix() + "\n") + original_filename = comment_filepath.as_posix().replace("../../Topelius SVN/", "") # create filepath string and shorten it + # use Null value if there is no matching file path + else: + original_filename = None + log_not_found.write("Publication name: " + publication_name + "\n") + # if there is no general comment, use template path for original filename + else: + original_filename = template_path + published = tuple[2] + legacy_id = tuple[3] + # insert file path or template path and some info about the publication into table publication_comment + insert_query = """INSERT INTO publication_comment(published, legacy_id, original_filename) 
VALUES (%s, %s, %s) RETURNING id""" + values_to_insert = (published, legacy_id, original_filename) + cursor_new.execute(insert_query, values_to_insert) + # get newly created comment id + comment_id = cursor_new.fetchone()[0] + publication_id = tuple[0] + # update table publication with the comment id for this publication + update_query = """UPDATE publication SET publication_comment_id = %s WHERE id = %s""" + values_to_insert = (comment_id, publication_id) + cursor_new.execute(update_query, values_to_insert) + conn_new_db.commit() + log_found.write("\nPublications matched: " + str(match_count) + "/" + str(publication_count) + ". Percentage matched: " + str(match_count/publication_count*100)) + log_found.close() + log_not_found.close() + conn_new_db.close() + cursor_new.close() + +main() \ No newline at end of file diff --git a/create_introduction_and_title.py b/create_introduction_and_title.py new file mode 100644 index 0000000..a3e3ce7 --- /dev/null +++ b/create_introduction_and_title.py @@ -0,0 +1,97 @@ +"""Script that inserts data for introductions and title pages into db and updates table publication_collection with the corresponding id:s. +Created by Anna Movall and Jonas Lillqvist in April 2020""" + +import psycopg2 +import re +import json + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +PROJECT_ID = 10 # enter the current project id +INTRODUCTION_FILE_PATH = "documents/Redaktionella_texter/Inledningar/" +TITLE_PAGE_FILE_PATH = "documents/Redaktionella_texter/Titelsidor/" + +# get relevant info from table publication_collection +def get_info_from_publication_collection(PROJECT_ID): + collection_info = [] + fetch_query = """SELECT id, published FROM publication_collection WHERE project_id = %s""" + cursor_new.execute(fetch_query, (PROJECT_ID,)) + collection_info = cursor_new.fetchall() + return collection_info + +# the name bases for the xml files (title and introduction) are stored in a csv together with old collection ids +def create_list_from_csv(filename): + with open(filename, "r", encoding="utf-8") as source_file: + collection_name_list = [] + for line in source_file: + row = line.rstrip() + elements = row.split(";") + collection_name_list.append(elements) + return collection_name_list + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +# create a dictionary with new collection id as key and file name base as value +def create_collection_name_dict(collection_names_with_old_id, collection_id_dict): + coll_name_dict = {} + for row in collection_names_with_old_id: + old_coll_id = row[0] + coll_name = row[1] + new_coll_id = collection_id_dict[old_coll_id] + coll_name_dict[new_coll_id] = coll_name + return coll_name_dict + +# insert data into table publication_collection_introduction +def create_publication_collection_introduction(published, introduction_original_filename): + insert_query = """INSERT INTO publication_collection_introduction(published, original_filename) VALUES (%s, %s) RETURNING id""" + values_to_insert = (published, introduction_original_filename) + cursor_new.execute(insert_query, values_to_insert) + introduction_id = cursor_new.fetchone()[0] + return introduction_id + +# insert data into table publication_collection_title +def create_publication_collection_title(published, title_page_original_filename): + insert_query = """INSERT INTO 
publication_collection_title(published, original_filename) VALUES (%s, %s) RETURNING id""" + values_to_insert = (published, title_page_original_filename) + cursor_new.execute(insert_query, values_to_insert) + title_page_id = cursor_new.fetchone()[0] + return title_page_id + +# update table publication_collection with the ids for introduction and title page +def update_publication_collection(introduction_id, title_page_id, collection_id): + update_query = """UPDATE publication_collection SET publication_collection_introduction_id=%s, publication_collection_title_id=%s WHERE id=%s""" + values_to_insert = (introduction_id, title_page_id, collection_id) + cursor_new.execute(update_query, values_to_insert) + +def main(): + collection_info = get_info_from_publication_collection(PROJECT_ID) + # create list of old collection ids and collection names for file name bases + collection_names_with_old_id = create_list_from_csv("csv/introduction_title_names.csv") + # create dict mapping old and new collection ids + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + # create dict mapping new ids and file name bases + collection_name_dict_with_new_ids = create_collection_name_dict(collection_names_with_old_id, collection_id_dict) + for collection in collection_info: + collection_id = collection[0] + published = collection[1] + name = collection_name_dict_with_new_ids[collection_id] + introduction_original_filename = INTRODUCTION_FILE_PATH + name + "_inl.xml" + introduction_id = create_publication_collection_introduction(published, introduction_original_filename) + title_page_original_filename = TITLE_PAGE_FILE_PATH + name + "_tit.xml" + title_page_id = create_publication_collection_title(published, title_page_original_filename) + update_publication_collection(introduction_id, title_page_id, collection_id) + conn_new_db.commit() + cursor_new.close() + conn_new_db.close() + +main() \ No newline at end of file diff --git a/create_toc.py b/create_toc.py new file mode 100644 index 0000000..20c350c --- /dev/null +++ b/create_toc.py @@ -0,0 +1,212 @@ +""" +Script that fetches info from table tableofcontents in old db and transforms it +into one toc json file for each new collection. +Created by Anna Movall and Jonas Lillqvist in March 2020. 
+""" + +import mysql.connector +import psycopg2 +import json +import operator +import re + +# insert current project id here +PROJECT_ID = 10 + +conn_old_db = mysql.connector.connect( + host="", + database="", + user="", + passwd="" +) +cursor_old = conn_old_db.cursor() + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +# get info about toc items in one collection from old db +def get_toc_info(old_collection_id): + # collection Letters (Brev) was split into two new collections (30, 31), which are not part of old table publications or tableofcontents, only table publications_collections + # we need to change their old collection id to the actual old id 15 and not use publications_collections id 30/31, found in the coll_id_dict + if old_collection_id == "30": + fetch_query = """SELECT tableofcontents.title, toc_date, toc_linkID, tableofcontents.sortOrder, publications_group.sortOrder FROM tableofcontents, publications_group WHERE toc_zts_id=%s AND toc_coll_id=%s AND toc_group_id=group_id""" + values_to_insert = (15, 1) + elif old_collection_id == "31": + fetch_query = """SELECT title, toc_date, toc_linkID, sortOrder FROM tableofcontents WHERE toc_zts_id=%s AND toc_coll_id=%s""" + values_to_insert = (15, 2) + else: + fetch_query = """SELECT title, toc_date, toc_linkID, sortOrder FROM tableofcontents WHERE toc_zts_id=%s""" + values_to_insert = (old_collection_id,) + cursor_old.execute(fetch_query, values_to_insert) + toc_info = cursor_old.fetchall() + # the date value for Brev needs to be edited so that None is substituted with "0" + # otherwise sorting by date is not possible + # to be editable, the tuples in toc_info need to be lists + if old_collection_id == "30" or old_collection_id == "31": + toc_info_list = [] + for tuple in toc_info: + row_list = list(tuple) + if row_list[1] is None: + row_list[1] = "0" + toc_info_list.append(row_list) + # for Forlagskorrespondens, sort based on publications_group.sortOrder, then based on date + if old_collection_id == "30": + toc_info_sorted = sorted(toc_info_list, key = operator.itemgetter(4,1)) + # for Foraldrakorrespondens, sort based on date + elif old_collection_id == "31": + toc_info_sorted = sorted(toc_info_list, key = operator.itemgetter(1)) + # for other collections, sort based on sortOrder + else: + toc_info_sorted = sorted(toc_info, key = operator.itemgetter(3)) + return toc_info_sorted + +# creates toc dictionary, used for json file +def create_dictionary(toc_info_sorted, old_collection_id, new_collection_id): + # use this if the dictionary has collections with no publications in the db + if len(toc_info_sorted) == 0: + collection_toc_dict = {} + print("List empty. 
Collection id old/new: ", old_collection_id, new_collection_id) + return False + # the first row in the list contains the name of the collection + collection_name = toc_info_sorted[0][0] + # create first level of dictionary as required for json toc + collection_toc_dict = {"text": collection_name, "collectionId": str(new_collection_id), "type": "title", "children": []} + # loop through toc_info_sorted, skip first row which contains collection name + for i in range(1, len(toc_info_sorted)): + row = toc_info_sorted[i] + text_title = row[0] + toc_date = row[1] + toc_linkID = row[2] + # skip rows which refer to letters listed in tableofcontents, with no publication linked to them; these have "Mibr" in toc_linkID + if toc_linkID is not None: + match = re.search("Mibr", toc_linkID) + if match is not None: + continue + # an item id is required for the json toc items + itemId = add_itemId(row, old_collection_id, new_collection_id) + # special rule for Forelasningar, which contains descriptions pertaining to a title + # the text should be added to the previous toc item with key "description"; no new toc item is created + if itemId == "" and old_collection_id == "20": + toc_item_dict["description"] = text_title + continue + # toc items which stand for sections and do not link to texts have no itemId + elif itemId == "": + toc_type = "section_title" + # items with itemId are links to reading texts, with type est + else: + toc_type = "est" + if toc_date is None: + toc_date = "" + # create dict for toc item and append it to list of children of first level dict + toc_item_dict = {"url": "", "type": toc_type, "text": text_title, "itemId": itemId, "date": toc_date} + collection_toc_dict["children"].append(toc_item_dict) + return collection_toc_dict + +# constructs itemId based on new collection id and new publication id +# to find out publication id, we need legacy_id +# which is constructed using old collection id and toc_linkID from old db +def add_itemId(row, old_collection_id, new_collection_id): + toc_linkID = row[2] + if toc_linkID is None or toc_linkID == "": + itemId = "" + # a toc_linkID that contains ch + 1-3 digits and possibly pos + 1-3 digits is a special kind of link + # it refers to a part of a file (a div or an anchor-element) + # the part beginning with ch or pos, the fragment_id, needs to be added to itemID + # but it has to be removed from toc_linkID before constructing the legacy_id + else: + pattern = re.compile(r";(ch\d{1,3}(;pos\d{1,4})?)") + match = re.search(pattern, toc_linkID) + if match is not None: + fragment_id = match.group(1) + toc_linkID = re.sub(pattern, "", toc_linkID) + else: + fragment_id = 0 + # collection Letters (Brev) was split into two new collections (30, 31), which are not part of old table publications or tableofcontents, only table publications_collections + # we need to change their old collection id to the actual old id 15 and not use publications_collections id 30/31, found in the coll_id_dict + if old_collection_id == "30" or old_collection_id == "31": + old_collection_id = "15" + legacy_id = old_collection_id + "_" + toc_linkID + publication_id = fetch_publication_id(legacy_id) + if fragment_id == 0: + itemId = str(new_collection_id) + "_" + str(publication_id) + else: + itemId = str(new_collection_id) + "_" + str(publication_id) + "_" + fragment_id + return itemId + +# get publication_id from new db using legacy_id +# limit selection to publications connected to the current publication collection +def fetch_publication_id(legacy_id): + fetch_query = 
"""SELECT id FROM publication WHERE legacy_id=%s AND publication_collection_id IN (SELECT id FROM publication_collection WHERE project_id=%s)""" + value_to_insert = (legacy_id, PROJECT_ID) + cursor_new.execute(fetch_query, value_to_insert) + result = cursor_new.fetchone() + if result is None: + print(legacy_id, "not found in publication") + publication_id = "" + else: + publication_id = result[0] + return publication_id + +def write_dict_to_file(dictionary, filename): + json_dict = json.dumps(dictionary, ensure_ascii=False) + with open(filename, "w", encoding="utf-8") as output_file: + output_file.write(json_dict) + print("Dictionary written to file", filename) + +# special function for generating toc for Lfb: values from csv, not from table tableofcontents +def create_toc_for_Lfb(filename, collection_id_dict): + lfb_list = create_list_from_csv(filename) + new_collection_id = collection_id_dict["32"] + collection_toc_dict = {"text": "Läsning för barn", "collectionId": str(new_collection_id), "type": "title", "children": []} + for row in lfb_list: + title = row[0] + legacy_id = row[3] + publication_id = fetch_publication_id(legacy_id) + itemId = str(new_collection_id) + "_" + str(publication_id) + toc_item_dict = {"url": "", "type": "est", "text": title, "itemId": itemId, "date": ""} + collection_toc_dict["children"].append(toc_item_dict) + filename = "toc_files/" + str(new_collection_id) + ".json" + write_dict_to_file(collection_toc_dict, filename) + +# creates a list from csv file with publication name, div id, publication id and legacy id +def create_list_from_csv(filename): + with open(filename, "r", encoding="utf-8") as source_file: + lfb_list = [] + for line in source_file: + row = line.rstrip() + elements = row.split(";") + lfb_list.append(elements) + return lfb_list + +def main(): + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + for key in collection_id_dict.keys(): + old_collection_id = key # string value! + # toc for Lasning for barn is created from csv file with a special function + if old_collection_id == "32": + continue + new_collection_id = collection_id_dict[old_collection_id] + toc_info_sorted = get_toc_info(old_collection_id) + # toc_info_sorted will be an empty list if key is not found in tableofcontents! + collection_toc_dict = create_dictionary(toc_info_sorted, old_collection_id, new_collection_id) + if collection_toc_dict: + filename = "toc_files/" + str(new_collection_id) + ".json" + write_dict_to_file(collection_toc_dict, filename) + create_toc_for_Lfb("csv/Lfb_split.csv", collection_id_dict) + conn_new_db.close() + cursor_new.close() + conn_old_db.close() + cursor_old.close() + +main() \ No newline at end of file diff --git a/facsimile_url_info.py b/facsimile_url_info.py new file mode 100644 index 0000000..dec3cc1 --- /dev/null +++ b/facsimile_url_info.py @@ -0,0 +1,136 @@ +"""Script that fetches metadata based on url using API and inserts it into table publication_facsimile_collection. The script also inserts id:s into table publication_facsimile. +Created by Anna Movall and Jonas Lillqvist in March 2020. 
+Information about the API (see the section OAI-PMH):
+https://wiki.helsinki.fi/display/Comhis/Interfaces+of+digi.kansalliskirjasto.fi"""
+
+import psycopg2
+import re
+import requests
+from bs4 import BeautifulSoup
+from datetime import date
+
+conn_new_db = psycopg2.connect(
+    host="",
+    database="",
+    user="",
+    port="",
+    password=""
+)
+cursor_new = conn_new_db.cursor()
+
+CSV_FILEPATH = ""
+
+# creates a list from csv file containing old publication id and facsimile url
+def create_list_from_csv(filename):
+    with open(filename, "r", encoding="utf-8") as source_file:
+        facsimile_url_list = []
+        for line in source_file:
+            row = line.rstrip()
+            elements = row.split(";")
+            legacy_id = elements[0]
+            url = elements[1]
+            publication_id = get_publication_id_from_legacy_id(legacy_id)
+            if publication_id:
+                facsimile_url_list.append([publication_id, url])
+    return facsimile_url_list
+
+# get new id from table publication using old id
+def get_publication_id_from_legacy_id(legacy_id):
+    fetch_query = """SELECT id FROM publication WHERE legacy_id=%s"""
+    value_to_insert = (legacy_id,)
+    cursor_new.execute(fetch_query, value_to_insert)
+    result = cursor_new.fetchone()
+    if result:
+        publication_id = result[0]
+    else:
+        publication_id = False
+    return publication_id
+
+# fetches metadata for each facsimile using api
+# the api request url contains the binding_id, which is part of the facsimile url
+# the date and title metadata is then processed to an html string in the right format and appended to the (sub)list
+def add_metadata_to_list(facsimile_url_list):
+    for row in facsimile_url_list:
+        url = row[1]
+        binding_id = get_binding_id(url)
+        api_url = "https://digi.kansalliskirjasto.fi/interfaces/OAI-PMH?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:digi.kansalliskirjasto.fi:" + binding_id
+        r = requests.get(api_url)
+        metadata_soup = BeautifulSoup(r.content, "xml")
+        title = metadata_soup.find("dc:title").get_text()
+        date_string = metadata_soup.find("dc:date").get_text()
+        try:
+            date_object = date.fromisoformat(date_string)
+            year = date_object.year
+            month = date_object.month
+            day = date_object.day
+            date_info = str(day) + "/" + str(month) + " " + str(year)
+        # if the date has no month and day, use it as it is
+        except ValueError:
+            print("Invalid iso date.")
+            date_info = date_string
+        # split the title info into two elements at comma, to separate the journal title and nr info
+        title_elements = title.split(",")
+        if len(title_elements) == 2:
+            pub_title = title_elements[0]
+            pub_nr = title_elements[1].lstrip()
+            pub_nr = pub_nr.replace("nr:", "nr")
+            db_title = "" + pub_title + " " + date_info + ", " + pub_nr
+            row.append(db_title)
+            print(row)
+        # if the title has no nr info, use it as it is
+        else:
+            db_title = "" + title + ", " + date_info
+            row.append(db_title)
+            print(row)
+            print(url, "has irregular metadata: ", title)
+
+# finds the binding_id component in facsimile url
+def get_binding_id(url):
+    pattern = re.compile(r"/(\d{4,7})\?")
+    match = re.search(pattern, url)
+    binding_id = match.group(1)
+    return binding_id
+
+# the csv facsimile url list was in chronological order
+# this order needs to be preserved in the db as the value of priority in publication_facsimile
+# the function gives the first row with a given id priority 1, the second priority 2 etc.
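+# A worked example with made-up values: if the rows at this point are
+# [[52, "url_a", "title_a"], [52, "url_b", "title_b"], [87, "url_c", "title_c"]],
+# the function below appends priorities 1, 2 and 1 respectively, since the counter
+# restarts whenever the publication id in the first position changes.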
+def set_facsimile_order(facsimile_url_list): + publication_id = 0 + for row in facsimile_url_list: + if row[0] != publication_id: + priority = 1 + else: + priority += 1 + row.append(priority) + publication_id = row[0] + +# inserts needed values into table publication_facsimile_collection, returning the newly created id for the facsimile +# calls function create_publication_facsimile which inserts facsimile id and publication id into table publication_facsimile +def create_publication_facsimile_collection(facsimile_url_list): + insert_query = """INSERT INTO publication_facsimile_collection(title, external_url) VALUES (%s, %s) RETURNING id""" + for row in facsimile_url_list: + publication_id = row[0] + url = row[1] + title = row[2] + priority = row[3] + values_to_insert = (title, url) + cursor_new.execute(insert_query, values_to_insert) + facsimile_id = cursor_new.fetchone()[0] + create_publication_facsimile(publication_id, facsimile_id, priority) + +# inserts facsimile id and publication id into table publication_facsimile +def create_publication_facsimile(publication_id, facsimile_id, priority): + insert_query = """INSERT INTO publication_facsimile(publication_facsimile_collection_id, publication_id, page_nr, section_id, priority, type) VALUES (%s, %s, %s, %s, %s, %s)""" + values_to_insert = (facsimile_id, publication_id, 0, 0, priority, 0) + cursor_new.execute(insert_query, values_to_insert) + +def main(): + facsimile_url_list = create_list_from_csv(CSV_FILEPATH) + add_metadata_to_list(facsimile_url_list) + set_facsimile_order(facsimile_url_list) + create_publication_facsimile_collection(facsimile_url_list) + conn_new_db.commit() + cursor_new.close() + conn_new_db.close() + +main() \ No newline at end of file diff --git a/migrate_facsimiles.py b/migrate_facsimiles.py new file mode 100644 index 0000000..8a32dcc --- /dev/null +++ b/migrate_facsimiles.py @@ -0,0 +1,112 @@ +"""Script that migrates data to tables publication_facsimile_collection and publication_facsimile. 
+Created by Anna Movall and Jonas Lillqvist in January 2020""" + +import mysql.connector +import psycopg2 +import json + +conn_old_db = mysql.connector.connect( + host="", + database="", + user="", + passwd="" +) +cursor_old = conn_old_db.cursor() + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +def write_dict_to_file(dictionary, filename): + json_dict = json.dumps(dictionary) + with open(filename, "w", encoding="utf-8") as output_file: + output_file.write(json_dict) + print("Dictionary written to file", filename) + +def write_text_to_file(text, filename): + with open(filename, "w", encoding="utf-8") as output_file: + output_file.write(text) + print("Text written to file", filename) + +def create_facsimile_collection(): + fetch_query = """SELECT publication_id, title, description, pages, pre_page_count, pages_comment, facs_url FROM facsimiles""" + cursor_old.execute(fetch_query) + old_tuples = cursor_old.fetchall() + insert_query = """INSERT INTO publication_facsimile_collection(title, description, number_of_pages, start_page_number, page_comment, external_url) VALUES (%s, %s, %s, %s, %s, %s) RETURNING id""" + facsimile_coll_id_dict = {} + for tuple in old_tuples: + old_id = tuple[0] + values_to_insert = (tuple[1], tuple[2], tuple[3], tuple[4], tuple[5], tuple[6]) + cursor_new.execute(insert_query, values_to_insert) + new_id = cursor_new.fetchone()[0] + facsimile_coll_id_dict[old_id] = new_id + conn_new_db.commit() + return facsimile_coll_id_dict + +# this table connects publications (= texts) and facsimiles +def create_publication_facsimile(publication_id_dict, facsimile_coll_id_dict, manuscript_id_dict): + fetch_query = """SELECT publications_id, section_id, facs_id, page_nr, priority, type, ms_id FROM facsimile_publications""" + cursor_old.execute(fetch_query) + old_tuples = cursor_old.fetchall() + insert_query = """INSERT INTO publication_facsimile(publication_id, section_id, publication_facsimile_collection_id, page_nr, priority, type, publication_manuscript_id) VALUES (%s, %s, %s, %s, %s, %s, %s)""" + excluded_tuples = "" # for saving info about tuples in the old db that are excluded from migration + for tuple in old_tuples: + old_publication_id = tuple[0] + # do not include tuples which refer to unpublished texts: + if str(old_publication_id) not in publication_id_dict.keys(): + excluded_tuples += "tuple with old_publication_id: " + str(old_publication_id) + " skipped \n" + continue + publication_id = publication_id_dict[str(old_publication_id)] #get new id from dictionary using old id as key; this value is a string in the json dictionary file + section_id = tuple[1] + # the new db requires an int value for section_id and does not accept null + if section_id is None: + section_id = 0 + if isinstance(section_id, str): # check that the old value is a string before making replace, to avoid error + section_id = int(section_id.replace("ch", "")) # section_id is of type int in new db + old_facsimile_id = tuple[2] + if str(old_facsimile_id) not in facsimile_coll_id_dict.keys(): # skip tuples which refer to unpublished facsimiles + excluded_tuples += "tuple with old_facsimile_id: " + str(old_facsimile_id) + " skipped\n" + continue + facsimile_id = facsimile_coll_id_dict[str(tuple[2])] + page_nr = tuple[3] + priority = tuple[4] + type = tuple[5] + 
old_manuscript_id = tuple[6] + # only NULL, 0 or values in the dictionary are allowed for ms_id; otherwise the tuple should be skipped because it refers to an unpublished manuscript + if old_manuscript_id is not None and old_manuscript_id != 0 and str(old_manuscript_id) not in manuscript_id_dict.keys(): + excluded_tuples += "tuple with old_ms_id: " + str(old_manuscript_id) + " skipped\n" + continue + if str(old_manuscript_id) in manuscript_id_dict.keys(): + manuscript_id = manuscript_id_dict[str(old_manuscript_id)] # get new id from dictionary using old id as key + elif old_manuscript_id == 0: + manuscript_id = None # use NULL if old value is 0 + else: + manuscript_id = None # otherwise the old value is NULL, which is preserved + values_to_insert = (publication_id, section_id, facsimile_id, page_nr, priority, type, manuscript_id) + cursor_new.execute(insert_query, values_to_insert) + conn_new_db.commit() + write_text_to_file(excluded_tuples, "logs/excluded_facsimile_publications_tuples.txt") + +def main(): + facsimile_coll_id_dict = create_facsimile_collection() + write_dict_to_file(facsimile_coll_id_dict, "id_dictionaries/facsimile_coll_ids.json") + publication_id_dict = read_dict_from_file("id_dictionaries/publication_ids.json") + facsimile_coll_id_dict = read_dict_from_file("id_dictionaries/facsimile_coll_ids.json") + manuscript_id_dict = read_dict_from_file("id_dictionaries/manuscript_ids.json") + create_publication_facsimile(publication_id_dict, facsimile_coll_id_dict, manuscript_id_dict) + cursor_new.close() + conn_new_db.close() + conn_old_db.close() + cursor_old.close() + +main() \ No newline at end of file diff --git a/migrate_main_tables.py b/migrate_main_tables.py index a995591..d7a8b2a 100644 --- a/migrate_main_tables.py +++ b/migrate_main_tables.py @@ -5,7 +5,7 @@ import psycopg2 import json -PROJECT_NAME = "ZTS_2020_05_14" +PROJECT_NAME = "" conn_old_db = mysql.connector.connect( host="", @@ -22,7 +22,6 @@ port="", password="" ) - cursor_new = conn_new_db.cursor() def create_project(PROJECT_NAME): @@ -120,4 +119,4 @@ def main(): conn_old_db.close() cursor_old.close() -main() +main() \ No newline at end of file diff --git a/migrate_manuscripts_and_versions.py b/migrate_manuscripts_and_versions.py new file mode 100644 index 0000000..2b64e2f --- /dev/null +++ b/migrate_manuscripts_and_versions.py @@ -0,0 +1,101 @@ +"""Script that migrates data to tables publication_manuscript and publication_version. 
+Created by Anna Movall and Jonas Lillqvist in January 2020""" + +import mysql.connector +import psycopg2 +import json + +conn_old_db = mysql.connector.connect( + host="", + database="", + user="", + passwd="" +) +cursor_old = conn_old_db.cursor() + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +def create_publication_manuscript(publication_id_dict): + fetch_query = """SELECT m_id, m_publication_id, m_title, m_sort, m_filename, m_type, m_section_id FROM manuscripts""" + cursor_old.execute(fetch_query) + old_tuples = cursor_old.fetchall() + insert_query = """INSERT INTO publication_manuscript(publication_id, name, sort_order, legacy_id, type, section_id) VALUES (%s, %s, %s, %s, %s, %s) RETURNING id""" + manuscript_id_dictionary = {} + for tuple in old_tuples: + # if a manuscript does not belong to a published publication, it should not be migrated + old_publication_id = tuple[1] + if str(old_publication_id) not in publication_id_dict.keys(): + continue + old_id = tuple[0] + publication_id = publication_id_dict[str(old_publication_id)] # get new id from dictionary using old id as key; this value is a string in the json dictionary file + title = tuple[2] + sort_order = tuple[3] + legacy_id = tuple[4] + type = tuple[5] + section_id = tuple[6] + if section_id is not None: + section_id = int(section_id.replace("ch", "")) # remove ch, the id is an int in the new db + values_to_insert = (publication_id, title, sort_order, legacy_id, type, section_id) + cursor_new.execute(insert_query, values_to_insert) + new_id = cursor_new.fetchone()[0] + manuscript_id_dictionary[old_id] = new_id + conn_new_db.commit() + return manuscript_id_dictionary + +def create_publication_version(publication_id_dict): + fetch_query = """SELECT v_id, v_publication_id, v_title, v_sort, v_type, v_filename, v_section_id FROM versions""" + cursor_old.execute(fetch_query) + old_tuples = cursor_old.fetchall() + insert_query = """INSERT INTO publication_version (publication_id, name, sort_order, type, legacy_id, section_id) VALUES (%s, %s, %s, %s, %s, %s) RETURNING id""" + version_id_dict = {} + for tuple in old_tuples: + old_publication_id = tuple[1] + # if a version does not belong to a published publication, it should not be migrated + if str(old_publication_id) not in publication_id_dict.keys(): + continue + old_id = tuple[0] + publication_id = publication_id_dict[str(old_publication_id)] # get new id from dictionary using old id as key; this value is a string in the json dictionary file + title = tuple[2] + sort_order = tuple[3] + type = tuple[4] + legacy_id = tuple[5] + section_id = tuple[6] + if section_id is not None: + section_id = int(section_id.replace("ch", "")) # remove ch, the id is an int in the new db + values_to_insert = (publication_id, title, sort_order, type, legacy_id, section_id) + cursor_new.execute(insert_query, values_to_insert) + new_id = cursor_new.fetchone()[0] + version_id_dict[old_id] = new_id + conn_new_db.commit() + return version_id_dict + +def write_dict_to_file(dictionary, filename): + json_dict = json.dumps(dictionary) + with open(filename, "w", encoding="utf-8") as output_file: + output_file.write(json_dict) + print("Dictionary written to file.") + +def main(): + publication_id_dict = read_dict_from_file("id_dictionaries/publication_ids.json") + 
manuscript_id_dict = create_publication_manuscript(publication_id_dict) + write_dict_to_file(manuscript_id_dict, "id_dictionaries/manuscript_ids.json") + version_id_dict = create_publication_version(publication_id_dict) + write_dict_to_file(version_id_dict, "id_dictionaries/version_ids.json") + conn_new_db.close() + cursor_new.close() + conn_old_db.close() + cursor_old.close() + +main() \ No newline at end of file diff --git a/split_lasning_for_barn.py b/split_lasning_for_barn.py new file mode 100644 index 0000000..5dc7f34 --- /dev/null +++ b/split_lasning_for_barn.py @@ -0,0 +1,164 @@ +"""Script for splitting the 8 large xml files of Läsning för barn so that each story is in a separate file. The script also creates a csv file mapping legacy id:s with the newly created file paths. +Created by Anna Movall and Jonas Lillqvist in March 2020""" + +import os +from pathlib import Path +import re +from bs4 import BeautifulSoup + +XML_OUTPUT_FOLDER = "Lfb_split_files/" + +# creates a list from csv file with publication name, div id, publication id and legacy id +def create_list_from_csv(filename): + with open(filename, "r", encoding="utf-8") as source_file: + lfb_list = [] + for line in source_file: + row = line.rstrip() + elements = row.split(";") + lfb_list.append(elements) + return lfb_list + +# creates a folder for each of the 8 parts +def create_directories(directory_name_base): + for i in range(1,9): + dir_name = XML_OUTPUT_FOLDER + directory_name_base + str(i) + if not os.path.exists(dir_name): + os.makedirs(dir_name) + +# creates a path object for a folder from a given filepath string, saves all paths to files found in this folder or subfolders in a list +def create_file_list(filepath): + path = Path(filepath) + filelist = [] + iterate_through_folders(path, filelist) + return filelist + +# iterates through folders recursively and appends filepaths to list +def iterate_through_folders(path, filelist): + for content in path.iterdir(): + if content.is_dir(): + iterate_through_folders(content, filelist) + elif content.suffix == ".xml": + filelist.append(content) + +# save content of each large file in a dictionary with its part nr as key +def read_file_content_to_dict(large_file_list): + part_content_dict = {} + i = 1 + for path in large_file_list: + with path.open(encoding="utf-8") as source_file: + content = source_file.read() + part_content_dict[i] = content + i += 1 + return part_content_dict + +# create a file for each story in the right folder, using the story's name as basis for file name (transform it suitably) +# create file content using template xml and insert the right div from source files +# and title from lfb_list +def create_files(lfb_list, directory_name_base, part_content_dict): + # one file is created for each item in the list + for row in lfb_list: + name = row[0] + whole_id = row[1] + part_nr = whole_id[0] + div_id = whole_id[1:] + # remove special characters from publication names and add suffix .xml + file_name = create_file_name(name) + new_file_path = directory_name_base + part_nr + "/" + file_name + working_folder_path = XML_OUTPUT_FOLDER + new_file_path + # get the right div from the right source file + div_content = get_xml_content(part_nr, div_id, part_content_dict) + # create file content using template xml, div_content and title from list + with open(working_folder_path, "w", encoding="utf-8") as output_file: + template_soup = content_template() + # find the element where content is to be inserted + template_div = template_soup.find(type="collection") + # insert 
content
+            template_div.append(div_content)
+            # insert publication name as title
+            template_title = template_soup.find("title")
+            template_title.append(name)
+            # write to file as string
+            output_file.write(str(template_soup))
+        # update list with the newly created file path
+        row = add_db_file_path_to_list(row, new_file_path)
+    return lfb_list
+
+def add_db_file_path_to_list(row, new_file_path):
+    db_file_path = "documents/trunk/Lasning_for_barn/" + new_file_path
+    row.append(db_file_path)
+    return row
+
+def content_template():
+    # minimal TEI skeleton; create_files() only relies on the first <title> element
+    # and the empty div with type="collection"
+    xml_template = '''
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+  <teiHeader>
+    <fileDesc>
+      <titleStmt>
+        <title></title>
+      </titleStmt>
+      <publicationStmt>
+        <publisher>Zacharias Topelius Skrifter</publisher>
+      </publicationStmt>
+      <sourceDesc>
+        <p></p>
+      </sourceDesc>
+    </fileDesc>
+  </teiHeader>
+  <text>
+    <body>
+      <div type="collection">
+      </div>
+    </body>
+  </text>
+</TEI>
+ ''' + return BeautifulSoup(xml_template, "xml") + +def create_file_name(name): + # remove special characters from publication names + name = re.sub(r",|\.|\?|!|–|’|»|:|(|)|\[|\]|&", "", name).strip() + name = name.replace(" ", "_").lower() + name = name.replace("-", "_") + name = name.replace("ä", "a") + name = name.replace("å", "a") + name = name.replace("ö", "o") + name = name.replace("é", "e") + name = name.replace("ü", "u") + name = name.replace("æ", "ae") + # add file suffix + name = name + ".xml" + return name + + # finds and returns the right div from the right source file +def get_xml_content(part_nr, div_id, part_content_dict): + source_file_content = part_content_dict[int(part_nr)] + soup = BeautifulSoup(source_file_content, "xml") + div_content = soup.find(id=div_id) + return div_content + +# save parts of the updated list for later use +# only legacy id and file path are needed +# the file is needed for update_publication_with_filepaths.py +def write_list_to_csv(lfb_list, filename): + with open(filename, "w", encoding="utf-8") as output_file: + for row in lfb_list: + csv_row = row[3] + ";" + row[4] + "\n" + output_file.write(csv_row) + +def main(): + # the starting point is a list of all the publications for which files need to be created + lfb_list = create_list_from_csv("csv/Lfb_split.csv") + # the files are created in folders whose name consist of this string and the part nr + directory_name_base = "Lasning_for_barn_" + create_directories(directory_name_base) + large_file_list = create_file_list("Lasning_for_barn") # give path to folder with source files to be split + part_content_dict = read_file_content_to_dict(large_file_list) + lfb_list = create_files(lfb_list, directory_name_base, part_content_dict) + write_list_to_csv(lfb_list, "csv/Lfb_signum_filer.csv") + +main() \ No newline at end of file diff --git a/split_lasning_for_barn_comments.py b/split_lasning_for_barn_comments.py new file mode 100644 index 0000000..8c5b3ad --- /dev/null +++ b/split_lasning_for_barn_comments.py @@ -0,0 +1,224 @@ +""" +Script for splitting the general comments for collection Lasning for barn. +Comments were delivered as one Word file, then transformed to one XML file. +The script creates an XML file for each comment, adds the right content and saves the file path. +Comment info is then inserted into the database and connected to the right publication in the db. +Created by Anna Movall and Jonas Lillqvist in March/April 2020. 
+""" + +import os +from pathlib import Path +import re +import psycopg2 +from bs4 import BeautifulSoup + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +XML_SOURCE_FILE = "" +DIRECTORY_NAME_BASE = "Lasning_for_barn_" +CSV_LIST = "csv/Lfb_split.csv" + +# creates a list from csv file with publication name, div id, publication id and legacy id +def create_list_from_csv(filename): + with open(filename, "r", encoding="utf-8") as source_file: + lfb_list = [] + for line in source_file: + row = line.rstrip() + elements = row.split(";") + lfb_list.append(elements) + return lfb_list + +# creates a folder for each of the 8 parts +def create_directories(DIRECTORY_NAME_BASE): + for i in range(1,9): + dir_name = DIRECTORY_NAME_BASE + str(i) + "_komm" + if not os.path.exists(dir_name): + os.makedirs(dir_name) + +def read_text_from_file(source_file_path): + with source_file_path.open(encoding="utf-8") as source_file: + content = source_file.read() + return content + +# save each main div, containing comments to one part, in a dictionary with the part nr as key +# the divs are saved as Beautiful Soup objects +def create_part_dict(comment_xml): + comment_soup = BeautifulSoup(comment_xml, "xml") + part_content_dict = {} + i = 1 + for element in comment_soup.body.children: + if element.name == "div": + part_content_dict[i] = element + i += 1 + return part_content_dict + +# create a file for each comment in the right folder, using the corresponding publication's name as basis for file name (transform it suitably) +# create file content using template xml and insert content from the right div in dictionary +# insert title from lfb_list +def create_files(lfb_list, DIRECTORY_NAME_BASE, part_content_dict): + # one file is created for each item in the list + for row in lfb_list: + name = row[0] + whole_id = row[1] + part_nr = whole_id[0] + # remove special characters from publication names and add suffix .xml + file_name = create_file_name(name) + new_file_path = DIRECTORY_NAME_BASE + part_nr + "_komm" + "/" + file_name + # get the right div as a soup object from the right source file + div_content = get_xml_content(part_nr, name, part_content_dict) + # remove head element from div_content + div_content.head.decompose() + # extract bibliography for later use + bibliography = div_content.find(rend="Litteratur") + if bibliography is not None: + bibliography.extract() + # create file content using template xml, div_content and title from list + with open(new_file_path, "w", encoding="utf-8") as output_file: + template_soup = content_template() + # find the element where content is to be inserted + template_comment_div = template_soup.find(type="comment") + # insert comment div contents without its own div + template_comment_div.append(div_content) + template_comment_div.div.unwrap() + # insert publication name as title + template_title = template_soup.find("title") + template_title.append(name) + # insert bibliography + if bibliography is not None: + template_bibl_div = template_soup.find(type="bibl") + template_bibl_div.append(bibliography) + # write to file as string + output_file.write(str(template_soup)) + # update list with the newly created file path + row = add_db_file_path_to_list(row, new_file_path) + return lfb_list + +# adds xml file path to one row in list of comment data +# it will later be inserted in the db +def add_db_file_path_to_list(row, new_file_path): + db_file_path = 
"documents/Redaktionella_texter/Kommentarer/Lasning_for_barn/" + new_file_path + row.append(db_file_path) + return row + +def content_template(): + xml_template = ''' + + + + + + + + + + + + Zacharias Topelius Skrifter + + +

+ + + + + +

+ + +
+
+
+
+
+ + +
+ ''' + return BeautifulSoup(xml_template, "xml") + +# creates comment file name using publication name as starting point +def create_file_name(name): + # remove special characters from publication names + name = re.sub(r",|\.|\?|!|–|’|»|:|(|)|\[|\]|&", "", name).strip() + name = name.replace(" ", "_").lower() + name = name.replace("-", "_") + name = name.replace("ä", "a") + name = name.replace("å", "a") + name = name.replace("ö", "o") + name = name.replace("é", "e") + name = name.replace("ü", "u") + name = name.replace("æ", "ae") + # add file suffix + name = name + "_komm.xml" + return name + + # finds and returns the right comment div from dictionary + # the head element in the comment div contains the commented publication's name + # it should match the name of the publication from the list +def get_xml_content(part_nr, name, part_content_dict): + part_div = part_content_dict[int(part_nr)] + comments = part_div.select("div > div") + comment_div = None + for comment in comments: + main_title = comment.head.get_text() + if main_title.lower() == name.lower(): + comment_div = comment + break + return comment_div + +# writes parts of the updated list to file for later use +# only legacy id and file path are needed +def write_list_to_csv(lfb_list, filename): + with open(filename, "w", encoding="utf-8") as output_file: + for row in lfb_list: + csv_row = row[3] + ";" + row[4] + "\n" + output_file.write(csv_row) + +# in order to update the db we need the new publication id +def get_id_from_publication(legacy_id): + fetch_query = """SELECT id FROM publication WHERE legacy_id = %s""" + cursor_new.execute(fetch_query, (legacy_id,)) + publication_id = cursor_new.fetchone() + return publication_id + +# insert comment data into table publication_comment +# then update table publication with the comment id +def create_comment_data(lfb_list): + for row in lfb_list: + legacy_id = row[3] + filepath = row[4] + published = 1 # published internally + publication_id = get_id_from_publication(legacy_id) + insert_query = """INSERT INTO publication_comment(published, legacy_id, original_filename) VALUES (%s, %s, %s) RETURNING id""" + values_to_insert = (published, legacy_id, filepath) + cursor_new.execute(insert_query, values_to_insert) + # get newly created comment id + comment_id = cursor_new.fetchone()[0] + # update table publication with the comment id for this publication + update_query = """UPDATE publication SET publication_comment_id = %s WHERE id = %s""" + values_to_insert = (comment_id, publication_id) + cursor_new.execute(update_query, values_to_insert) + conn_new_db.commit() + conn_new_db.close() + cursor_new.close() + +def main(): + # the starting point is a list of all the publications for which comment files need to be created + lfb_list = create_list_from_csv(CSV_LIST) + # the files are created in folders whose name consist of this string and the part nr + create_directories(DIRECTORY_NAME_BASE) + source_file_path = Path(XML_SOURCE_FILE) + comment_xml = read_text_from_file(source_file_path) + part_content_dict = create_part_dict(comment_xml) + lfb_list = create_files(lfb_list, DIRECTORY_NAME_BASE, part_content_dict) + write_list_to_csv(lfb_list, "csv/Lfb_kommentarer_filer.csv") + create_comment_data(lfb_list) + +main() \ No newline at end of file diff --git a/update_comment_with_lasning_for_barn.py b/update_comment_with_lasning_for_barn.py new file mode 100644 index 0000000..b7725b3 --- /dev/null +++ b/update_comment_with_lasning_for_barn.py @@ -0,0 +1,41 @@ +""" +Script used for updating table 
document in db topelius_notes with filepaths to comments for Lasning for barn. +Created by Anna Movall and Jonas Lillqvist in May 2020. +""" + +import mysql.connector + +conn_old_db = mysql.connector.connect( + host="", + database="", + user="", + passwd="" +) +cursor_old = conn_old_db.cursor() + +# creates a list from csv file with legacy id and file path +def create_list_from_csv(filename): + with open(filename, "r", encoding="utf-8") as source_file: + lfb_list = [] + for line in source_file: + row = line.rstrip() + elements = row.split(";") + lfb_list.append(elements) + return lfb_list + +def insert_document(filepath, title): + insert_query = """INSERT INTO document(path, title) VALUES(%s, %s)""" + values = (filepath, title) + cursor_old.execute(insert_query, values) + +def main(): + filepath_list = create_list_from_csv("csv/Lfb_signum_filer.csv") + for row in filepath_list: + filepath = "/" + row[1] + title = "abc" + insert_document(filepath, title) + conn_old_db.commit() + cursor_old.close() + conn_old_db.close() + +main() \ No newline at end of file diff --git a/update_manuscript_with_filepaths.py b/update_manuscript_with_filepaths.py new file mode 100644 index 0000000..5e55be8 --- /dev/null +++ b/update_manuscript_with_filepaths.py @@ -0,0 +1,123 @@ +"""Script that inserts file paths to manuscripts in table publication_manuscript. +Created by Anna Movall and Jonas Lillqvist in February 2020""" + +import psycopg2 +import json +from pathlib import Path +import re +from bs4 import BeautifulSoup + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +# get relevant info from table publication_manuscript using select with collection id +def get_manuscript_info(new_collection_id): + fetch_query = """SELECT publication_manuscript.id, publication_manuscript.name FROM publication_manuscript, publication WHERE publication_collection_id = %s AND publication.id = publication_id""" + cursor_new.execute(fetch_query, (new_collection_id,)) + manuscript_info = cursor_new.fetchall() + return manuscript_info + +# create path object for folder from given filepath string, save all paths to files found in this folder or subfolders in a list +def create_file_list(filepath): + path = Path(filepath) + filelist = [] + iterate_through_folders(path, filelist) + return filelist + +# iterate through folders recursively and append filepaths to list +def iterate_through_folders(path, filelist): + for content in path.iterdir(): + if content.is_dir(): + iterate_through_folders(content, filelist) + else: + filelist.append(content) + +# loop through list of all file paths to manuscript xml files, open the files and get content of title element +# create dictionary with title as key and file path as value +# the content of the title element was used to create the name of the manuscript in the old database, therefore it can be used to match manuscripts and file paths +def create_title_path_dict(filepath_list): + manuscript_title_path_dict = {} + duplicate_titles = [] + for path in filepath_list: + with path.open(encoding="utf-8") as source_file: + soup = BeautifulSoup(source_file, 'xml') + tei_header = soup.find('teiHeader') + title = tei_header.find('title').text.strip() + # add title to dictionary, if it isn't there + if title not in manuscript_title_path_dict.keys(): + 
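+                # map each unique title to its Path object; titles that occur in several files are handled below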
manuscript_title_path_dict[title] = path + # if the title is already in the dictionary (it is used in several files) + # add title to list of duplicates + else: + duplicate_titles.append((title, path.as_posix())) + # remove titles that are not unique (used in several files) from dictionary + # add title + filepath to list to make the list complete; these need to be checked manually + for item in duplicate_titles: + title = item[0] + if title in manuscript_title_path_dict.keys(): + dict_filepath = manuscript_title_path_dict.pop(title) + duplicate_titles.append((title, dict_filepath.as_posix())) + return manuscript_title_path_dict, duplicate_titles + +# update table publication_manuscript with original_filename +def update_publication_manuscript(manuscript_id, original_filename): + update_query = """UPDATE publication_manuscript SET original_filename = %s WHERE id = %s""" + values_to_insert = (original_filename, manuscript_id) + cursor_new.execute(update_query, values_to_insert) + +def main(): + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + # list of collections with collection id and path to folder containing manuscript files + old_collections = [(1, "../../Topelius SVN/documents/Manuskript/Ljungblommor_manuskript"), (2, "../../Topelius SVN/documents/Manuskript/Nya_blad_och_Ljung_manuskript"), (16, "../../Topelius SVN/documents/trunk/Ovrig_lyrik"), (24, "../../Topelius SVN/documents/trunk/Academica/Otryckta Academica texter"), (30, "../../Topelius SVN/documents/trunk/Brev/Forlagskorrespondens"), (17, "../../Topelius SVN/documents/trunk/Dramatik"), (19, "../../Topelius SVN/documents/Manuskript/Ovrig_barnlitteratur_manuskript"), (20, "../../Topelius SVN/documents/trunk/Forelasningar"), (29, "../../Topelius SVN/documents/trunk/Dagbocker"), (31, "../../Topelius SVN/documents/trunk/Brev/Foraldrakorrespondens"), (32, "../../Topelius SVN/documents/Manuskript/Lasning_for_barn_manuskript")] + # initialize counters for match log statistics + manuscript_count = 0 + match_count = 0 + log_found = open("logs/matched_manuscripts.txt", "w", encoding="utf-8") + log_not_found = open("logs/unmatched_manuscripts.txt", "w", encoding="utf-8") + log_not_found.write("The following manuscripts have no files connected to them.\n") + log_files_with_same_title = open("logs/manuscript_files_with_same_title.txt", "w", encoding="utf-8") + for collection in old_collections: + old_id = collection[0] + collection_path = collection[1] + new_collection_id = collection_id_dict[str(old_id)] # get new collection id using dictionary + manuscript_info = get_manuscript_info(new_collection_id) # select manuscripts with this collection id from table publication + filepath_list = create_file_list(collection_path) # create list of all manuscript file paths in this collection + # create dictionary with title from xml file as key and file path as value + manuscript_title_path_dict, duplicate_titles = create_title_path_dict(filepath_list) + for item in duplicate_titles: + log_files_with_same_title.write("TITLE: " + item[0] + " PATH: " + item[1] + "\n") + for tuple in manuscript_info: + manuscript_count += 1 + manuscript_id = tuple[0] + manuscript_name = tuple[1].strip() + # manuscript_name in database was originally created from the title element in the xml file for the manuscript + # if it matches a title (key) in the dictionary, we know it's filepath (value) + if manuscript_name in manuscript_title_path_dict.keys(): + filepath = manuscript_title_path_dict[manuscript_name] + original_filename = 
filepath.as_posix().replace("../../Topelius SVN/", "") # create file path string and shorten it
+                log_found.write("MANUSCRIPT NAME: " + manuscript_name + " MATCHED " + original_filename + "\n")
+                match_count += 1
+                # add original_filepath for manuscript in database
+                update_publication_manuscript(manuscript_id, original_filename)
+            else:
+                log_not_found.write("MANUSCRIPT NAME: " + manuscript_name + " MANUSCRIPT ID: " + str(manuscript_id) + "\n")
+    conn_new_db.commit()
+    log_found.write("\nManuscripts matched: " + str(match_count) + "/" + str(manuscript_count) + ". Percentage matched: " + str(match_count/manuscript_count*100))
+    log_found.close()
+    log_not_found.close()
+    log_files_with_same_title.close()
+    conn_new_db.close()
+    cursor_new.close()
+
+main()
\ No newline at end of file
diff --git a/update_notes.py b/update_notes.py
new file mode 100644
index 0000000..6784d22
--- /dev/null
+++ b/update_notes.py
@@ -0,0 +1,93 @@
+"""
+Script that updates the document_id:s in table documentnote in db topelius_notes.
+Läsning för barn consisted of 8 XML files; they were split into about 300, and
+the document id:s for them needed to be changed in the database containing lemmas and comments.
+Created by Anna Movall and Jonas Lillqvist in May 2020.
+"""
+
+import mysql.connector
+from pathlib import Path
+from bs4 import BeautifulSoup
+
+conn_old_db = mysql.connector.connect(
+    host="",
+    database="",
+    port="",
+    user="",
+    passwd="",
+    charset="utf8"
+)
+cursor_old = conn_old_db.cursor()
+
+OLD_DOCUMENT_ID = (4395, 4396, 4397, 4398, 4399, 4400, 4401, 4402)
+XML_SOURCE_FOLDER = "Lfb_split_files"
+
+# from table documentnote, fetch the id for each lemma belonging to the old Lfb-files
+def get_lemma_id():
+    fetch_query = """SELECT id FROM documentnote WHERE document_id IN (%s, %s, %s, %s, %s, %s, %s, %s)"""
+    values = OLD_DOCUMENT_ID
+    cursor_old.execute(fetch_query, values)
+    lemma_id = cursor_old.fetchall()
+    return lemma_id
+
+# create path object for folder from given filepath string, save all paths to files found in this folder or subfolders in a list
+def create_file_list():
+    path = Path(XML_SOURCE_FOLDER)
+    filelist = []
+    iterate_through_folders(path, filelist)
+    return filelist
+
+# iterate through folders recursively and append filepaths to list
+def iterate_through_folders(path, filelist):
+    for content in path.iterdir():
+        if content.is_dir():
+            iterate_through_folders(content, filelist)
+        else:
+            filelist.append(content)
+
+# find out which file a lemma belongs to and return the file path for the file
+def find_lemma_in_file(lemma_id, xml_filepath_list):
+    xml_id_value = "start" + str(lemma_id)
+    for filepath in xml_filepath_list:
+        with filepath.open(encoding="utf-8") as xml_file:
+            soup = BeautifulSoup(xml_file, "xml")
+            anchor = soup.find(attrs={"xml:id" : xml_id_value})
+            if anchor:
+                return filepath
+    return False
+
+# get document_id from db using file path
+def fetch_document_id(filepath):
+    fetch_query = """SELECT id FROM document WHERE path = %s"""
+    value = (filepath,)
+    cursor_old.execute(fetch_query, value)
+    document_id = cursor_old.fetchone()
+    # fetchone() returns None if there is no matching row; check before indexing
+    if document_id:
+        return document_id[0]
+    print(filepath, "not found")
+    return False
+
+# update table documentnote with the new document_id for each lemma
+def update_document_id(new_document_id, lemma_id):
+    update_query = """UPDATE documentnote SET document_id = %s WHERE id = %s"""
+    values_to_insert = (new_document_id, lemma_id)
+    cursor_old.execute(update_query, values_to_insert)
+
+def main():
+    lemma_ids = get_lemma_id()
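+    # get_lemma_id() returns rows from fetchall() as 1-tuples, so the id itself is accessed as lemma_id[0] below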
+ xml_filepath_list = create_file_list() + for lemma_id in lemma_ids: + filepath = find_lemma_in_file(lemma_id[0], xml_filepath_list) + if filepath: + folder = filepath.parts[1] + filename = filepath.parts[2] + filepath = "/documents/trunk/Lasning_for_barn/" + folder + "/" + filename + new_document_id = fetch_document_id(filepath) + if new_document_id: + update_document_id(new_document_id, lemma_id[0]) + print(filepath, new_document_id) + conn_old_db.commit() + cursor_old.close() + conn_old_db.close() + +main() \ No newline at end of file diff --git a/update_publication_with_filepaths.py b/update_publication_with_filepaths.py new file mode 100644 index 0000000..62bb152 --- /dev/null +++ b/update_publication_with_filepaths.py @@ -0,0 +1,165 @@ +"""Script that inserts file paths to reading text in table publication. +Created by Anna Movall and Jonas Lillqvist in February 2020""" + +import mysql.connector +import psycopg2 +import json +from pathlib import Path +import re +from fuzzywuzzy import fuzz + +from create_comment_data import create_file_list +from create_comment_data import iterate_through_folders +from create_comment_data import read_dict_from_file +from create_comment_data import compare_pubnames_with_filenames + +conn_old_db = mysql.connector.connect( + host="", + database="", + user="", + passwd="" +) +cursor_old = conn_old_db.cursor() + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +# get relevant info from publication table using select with collection id +def get_publication_info(new_collection_id): + fetch_query = """SELECT id, name, legacy_id FROM publication WHERE publication_collection_id = %s""" + cursor_new.execute(fetch_query, (new_collection_id,)) + publication_info = cursor_new.fetchall() + return publication_info + +# compare letters' identifiers with all of the collection's file names (containing the same identifiers) to find out each letter's original file path +def compare_letters_with_filenames(publication_id, filepath_list, match_count, publication_count): + publication_id_dict = read_dict_from_file("id_dictionaries/publication_ids.json") + fetch_query = """SELECT p_FM from publications WHERE p_id = %s""" + # use new publication id to find out old id using id dictionary + # then use old id to fetch letter identifier from old database + for key, value in publication_id_dict.items(): + if value == publication_id: + old_publication_id = int(key) + cursor_old.execute(fetch_query, (old_publication_id,)) + signum = cursor_old.fetchone()[0] + if signum is None or signum == "": + original_path = None + break + signum = signum.strip() + found = False + i = 0 + # for filepath in filepath_list: + while found == False and i < len(filepath_list): + original_path = filepath_list[i] + # get filename without suffix + filepath = filepath_list[i].stem + # most letter filepaths contain an identifier in this form: + search_str = re.compile(r"Br\d{1,4}$") + # search for an identifier in the filepath + match_str = re.search(search_str, filepath) + # if the file path contains no identifier, skip to next file path in the list + if match_str is None: + i += 1 + continue + # if the identifier is found, save the matched string in a variable + match_str = match_str.group(0) + # compare the matched string to identifier from old database + if match_str == signum: + found = True + match_count += 1 + break # exit loop if the two values match + i += 1 + if not found: + original_path = None + 
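+    # publication_count below is incremented for every letter processed, whether or not a file path was matched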
publication_count += 1 + return original_path, match_count, publication_count + +# update table publication with original_filename for each publication, if the file name has been found +def update_publication(log_found, publication_name, original_filename, publication_id): + log_found.write("PUBLICATION: " + publication_name + " MATCHED " + original_filename + "\n") + update_query = """UPDATE publication SET original_filename = %s WHERE id = %s""" + values_to_insert = (original_filename, publication_id) + cursor_new.execute(update_query, values_to_insert) + +# reads csv and creates dictionary for update of table publication with original_filename, for collection Publicistik, Forelasningar and Lasning for barn +def create_dict_from_csv(filename): + with open(filename, encoding="utf-8") as source_file: + rows = source_file.readlines() + info_dict = {} + for row in rows: + row = row.rstrip() + elements = row.split(";") + info_dict[elements[0]] = elements[1] + return(info_dict) + +def main(): + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + old_collections = [(1, "../../Topelius SVN/documents/trunk/Ljungblommor"), (2, "../../Topelius SVN/documents/trunk/Nya_blad_och_Ljung"), (4, "../../Topelius SVN/documents/trunk/Noveller"), (5, "../../Topelius SVN/documents/trunk/Hertiginnan_af_Finland_och_andra_historiska_noveller"), (7, "../../Topelius SVN/documents/trunk/Vinterqvallar"), (12, "../../Topelius SVN/documents/trunk/Finland_framstalldt_i_teckningar"), (16, "../../Topelius SVN/documents/trunk/Ovrig_lyrik"), (18, "../../Topelius SVN/documents/trunk/Noveller_och_kortprosa"), (24, "../../Topelius SVN/documents/trunk/Academica"), (30, "../../Topelius SVN/documents/trunk/Brev/Forlagskorrespondens"), (6, "../../Topelius SVN/documents/trunk/Faltskarns_berattelser"), (8, "../../Topelius SVN/documents/trunk/Planeternas_skyddslingar"), (10, "../../Topelius SVN/documents/trunk/Naturens_bok_och_Boken_om_vart_land"), (13, "../../Topelius SVN/documents/trunk/En_resa_i_Finland"), (17, "../../Topelius SVN/documents/trunk/Dramatik"), (19, "../../Topelius SVN/documents/trunk/Ovrig_barnlitteratur"), (20, "../../Topelius SVN/documents/trunk/Forelasningar"), (22, "../../Topelius SVN/documents/trunk/Finland_i_19de_seklet"), (23, "../../Topelius SVN/documents/trunk/Publicistik"), (26, "../../Topelius SVN/documents/trunk/Religiosa_skrifter_och_psalmer"), (29, "../../Topelius SVN/documents/trunk/Dagbocker"), (31, "../../Topelius SVN/documents/trunk/Brev/Foraldrakorrespondens"), (32, "../../Topelius SVN/documents/trunk/Lasning_for_barn")] + # initialize counters for match log statistics + publication_count = 0 + match_count = 0 + # create log files + log_found = open("logs/matched_reading_texts.txt", "w", encoding="utf-8") + log_not_found = open("logs/unmatched_reading_texts.txt", "w", encoding="utf-8") + # loop through collections and publications in them + for collection in old_collections: + old_id = collection[0] + collection_path = collection[1] + new_collection_id = collection_id_dict[str(old_id)] # get new collection id using dictionary + publication_info = get_publication_info(new_collection_id) # select publications with this collection id from table publication + filepath_list = create_file_list(collection_path) + for tuple in publication_info: + publication_name = tuple[1] + publication_id = tuple[0] + legacy_id = tuple[2] + if old_id < 30 and old_id != 23 and old_id != 20: # don't use this comparison function for Brev, Publicistik, Forelasningar + filepath, match_count, 
publication_count = compare_pubnames_with_filenames(publication_name, filepath_list, match_count, publication_count) + # if the publication has a matching file path, update table publication and write match to log file + if filepath is not None: + original_filename = filepath.as_posix().replace("../../Topelius SVN/", "") # create file path string and shorten it + update_publication(log_found, publication_name, original_filename, publication_id) + # if no matching file path was found, write this to log file + else: + log_not_found.write("Publication name: " + publication_name + "\n") + elif old_id == 30 or old_id == 31: # Brev have their own comparison function; otherwise the same as above + filepath, match_count, publication_count = compare_letters_with_filenames(publication_id, filepath_list, match_count, publication_count) + if filepath is not None: + original_filename = filepath.as_posix().replace("../../Topelius SVN/", "") # create file path string and shorten it + update_publication(log_found, publication_name, original_filename, publication_id) + else: + log_not_found.write("Publication name: " + publication_name + "\n") + elif old_id == 23: # matching file paths for Publicistik are kept in a separate document + publicistik_info_dict = create_dict_from_csv("csv/ZTS_Publicistik_verk_signum_filer.csv") + # get file name from dictionary using legacy_id + if legacy_id in publicistik_info_dict.keys(): + filename = publicistik_info_dict[legacy_id] + year = filename[0:4] # get year from file name and use it as folder name + original_filename = "documents/trunk/Publicistik/" + year + "/" + filename + update_publication(log_found, publication_name, original_filename, publication_id) + else: + log_not_found.write("Publication name: " + publication_name + "\n") + elif old_id == 20: # matching file paths for Forelasningar are kept in a separate document; they are all there; otherwise as above + forelasningar_info_dict = create_dict_from_csv("csv/Forelasningar_signum_filer.csv") + filename = forelasningar_info_dict[legacy_id] + original_filename = "documents/trunk/Forelasningar/" + filename + update_publication(log_found, publication_name, original_filename, publication_id) + elif old_id == 32: # matching file paths for Lasning for barn are kept in a separate document + lfb_info_dict = create_dict_from_csv("csv/Lfb_signum_filer.csv") + original_filename = lfb_info_dict[legacy_id] + update_publication(log_found, publication_name, original_filename, publication_id) + conn_new_db.commit() + log_found.write("\nPublications matched: " + str(match_count) + "/" + str(publication_count) + ". Percentage matched: " + str(match_count/publication_count*100)) + log_found.close() + log_not_found.close() + conn_new_db.close() + cursor_new.close() + conn_old_db.close() + cursor_old.close() + +main() \ No newline at end of file diff --git a/update_version_with_filepaths.py b/update_version_with_filepaths.py new file mode 100644 index 0000000..f160a93 --- /dev/null +++ b/update_version_with_filepaths.py @@ -0,0 +1,200 @@ +""" +Script that updates table publication_version with file paths for the version's original xml file. +Created by Anna Movall and Jonas Lillqvist in February/March 2020. 
+""" + +import psycopg2 +import json +from pathlib import Path +import re +from bs4 import BeautifulSoup +from fuzzywuzzy import fuzz + +conn_new_db = psycopg2.connect( + host="", + database="", + user="", + port="", + password="" +) +cursor_new = conn_new_db.cursor() + +def read_dict_from_file(filename): + with open(filename, encoding="utf-8") as source_file: + json_content = json.load(source_file) + return json_content + +# get relevant info from table publication_version using select with collection id +def get_version_info(new_collection_id): + fetch_query = """SELECT publication_version.id, publication.name, publication_version.legacy_id, publication_version.name FROM publication_version, publication WHERE publication_collection_id = %s AND publication.id = publication_id""" + cursor_new.execute(fetch_query, (new_collection_id,)) + version_info = cursor_new.fetchall() + return version_info + +# create path object for folder from given filepath string, save all paths to files found in this folder or subfolders in a list +def create_file_list(filepath): + path = Path(filepath) + filelist = [] + iterate_through_folders(path, filelist) + return filelist + +# iterate through folders recursively and append filepaths to list +def iterate_through_folders(path, filelist): + for content in path.iterdir(): + if content.is_dir(): + iterate_through_folders(content, filelist) + elif content.suffix == ".xml": + filelist.append(content) + +# opens an xml file from the web sever (the script uses a local copy of the files) +# extracts its body element and removes added attributes to make it as similar as possible to the original xml file +# returns the body for comparison +def get_body_from_web_xml(filepath): + with open(filepath, encoding="utf-8") as source_file: + soup = BeautifulSoup(source_file, "xml") + body = soup.body + for tag in body.find_all("l"): + del tag["n"] + for tag in body.find_all("lg"): + del tag["xml:id"] + for tag in body.find_all("p"): + del tag["xml:id"] + return str(body) + +# opens an original xml file from SVN and extracts its body element +# returns the body for comparison +def get_body_from_xml(filepath): + with filepath.open(encoding="utf-8", errors='ignore') as source_file: + soup = BeautifulSoup(source_file, "xml") + body = soup.body + return str(body) + +# opens an original xml file from SVN and extracts the text content of its title element +def get_title_from_xml(filepath): + with filepath.open(encoding="utf-8", errors='ignore') as source_file: + soup = BeautifulSoup(source_file, "xml") + title = soup.title.get_text() + return title + +# updates table publication_version with original_filename +def update_publication_version(version_id, original_filename): + update_query = """UPDATE publication_version SET original_filename = %s WHERE id = %s""" + values_to_insert = (original_filename, version_id) + cursor_new.execute(update_query, values_to_insert) + +# receives the publication name connected to the version and compares it to the folder names in the list of all file paths for this collection +# matching paths are added to a list +# we use partial match because folder names are sometimes shortened or altered versions of the publication name +def compare_pub_name_with_directories(pub_name, filepath_list): + # remove special characters from publication names + search_str = re.sub(r",|\.|\?|!|–|’|»|:|(|)|\[|\]|&", "", pub_name).strip() + search_str = search_str.replace(" ", "_").lower() + search_str = search_str.replace("-", "_") + search_str = search_str.replace("ä", 
"a") + search_str = search_str.replace("å", "a") + search_str = search_str.replace("ö", "o") + search_str = search_str.replace("é", "e") + search_str = search_str.replace("ü", "u") + search_str = search_str.replace("æ", "ae") + i = 0 + match_list = [] + while i < len(filepath_list): + original_path = filepath_list[i] + dir_name = original_path.parts[-2].lower() # gets the last directory in the file path, this folder contains the versions of a specific publication + match_ratio = fuzz.partial_ratio(search_str, dir_name) # compares publication name and folder name + if match_ratio == 100: + match_list.append(original_path) # appends possible original paths for this version to a list + i += 1 + return match_list + +# check for duplicate file paths in log for matched versions; every file path should appear only once +# duplicates are a sign of corrupt data that needs to be corrected manually +def check_for_duplicate_file_paths(): + log_duplicate_matched_versions = open("logs/duplicate_matched_versions.txt", "w", encoding="utf-8") + with open("logs/matched_versions.txt", "r", encoding="utf-8") as source_file: + all_text = source_file.read() + regex = re.compile(r"ORIGINAL PATH: .*?\.xml") + list_of_original_paths = re.findall(regex, all_text) + while len(list_of_original_paths) > 0: + last_item = list_of_original_paths.pop() + if last_item in list_of_original_paths: + log_duplicate_matched_versions.write("Duplicate: " + last_item + "\n") + log_duplicate_matched_versions.close() + +def main(): + collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json") + # list of collections with collection id and path to folder containing version files + old_collections = [(32, "../../Topelius SVN/documents/Varianter/Lasning_for_barn_varianter")] + # initialize counters for match log statistics + version_count = 0 + match_count = 0 + log_directory_not_found = open("logs/version_directory_not_found_lfb.txt", "w", encoding="utf-8") + log_directory_found = open("logs/version_directory_found_lfb.txt", "w", encoding="utf-8") + log_matched_versions = open("logs/matched_versions_lfb.txt", "w", encoding="utf-8") + log_unmatched_versions = open("logs/unmatched_versions_lfb.txt", "w", encoding="utf-8") + for collection in old_collections: + old_id = collection[0] + collection_path = collection[1] + new_collection_id = collection_id_dict[str(old_id)] # get new collection id using dictionary + version_info = get_version_info(new_collection_id) + filepath_list = create_file_list(collection_path) # create list of all version file paths in this collection + for tuple in version_info: + version_count += 1 + version_id = tuple[0] + pub_name = tuple[1] + web_xml_filepath = "var/" + tuple[2] + version_name = tuple[3].strip() + original_path_list = compare_pub_name_with_directories(pub_name, filepath_list) + # if no directory and thus no possible file paths were found for this version: + if len(original_path_list) == 0: + log_directory_not_found.write("PUBLICATION NAME: " + pub_name + " VERSION ID: " + str(version_id) + "\n") + else: + log_directory_found.write("PUBLICATION NAME: " + pub_name + " VERSION ID: " + str(version_id) + "\nPATH LIST: " + "\n") + for path in original_path_list: + log_directory_found.write(path.as_posix() + "\n") + # for each file path with a folder name matching the publication name, get the content of the title element of the file and compare it with the version name + filepath_match_list = [] + for path in original_path_list: + title = get_title_from_xml(path).strip() + # use for 
most collections: + #if title == version_name: + #use for lasning for barn: + version_name = version_name.replace(",", "") + title = title.replace("(", "") + title = title.replace(")", "") + title = title.replace(",", "") + if version_name in title: + filepath_match_list.append(path) + found = False + # if title matched version_name just once, we have found the file path + if len(filepath_match_list) == 1: + original_filepath = filepath_match_list[0] + found = True + # if title matched version_name more than once, we need to compare the content of the web xml file and the original (SVN) files in order to find the correct file path + elif len(filepath_match_list) > 1: + web_xml_body = get_body_from_web_xml(web_xml_filepath) + for path in filepath_match_list: + content = get_body_from_xml(path) + score = fuzz.ratio(web_xml_body, content) + if score >= 90: + original_filepath = path + found = True + # if we have found a file path for the version, update table publication_version with original_filename + if found: + match_count += 1 + original_filename = original_filepath.as_posix().replace("../../Topelius SVN/", "") # shorten file path string + update_publication_version(version_id, original_filename) + log_matched_versions.write("\nPUBLICATION NAME: " + pub_name + " WEB XML PATH: " + web_xml_filepath + "\nORIGINAL PATH: " + original_filename) + else: + log_unmatched_versions.write("\nPUBLICATION NAME: " + pub_name + " WEB XML PATH: " + web_xml_filepath) + conn_new_db.commit() + log_matched_versions.write("\nVersions matched: " + str(match_count) + "/" + str(version_count) + ". Percentage matched: " + str(match_count/version_count*100)) + log_directory_found.close() + log_directory_not_found.close() + log_matched_versions.close() + log_unmatched_versions.close() + check_for_duplicate_file_paths() + conn_new_db.close() + cursor_new.close() + +main() \ No newline at end of file