From 394511d315bd4f4a10bb2f7d0009ee6fa1fc925c Mon Sep 17 00:00:00 2001 From: tpr Date: Wed, 19 Oct 2022 00:04:26 -0500 Subject: [PATCH] draft, untested ProjectCommitData aligner --- prism/data/repair/align.py | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 prism/data/repair/align.py diff --git a/prism/data/repair/align.py b/prism/data/repair/align.py new file mode 100644 index 00000000..bde31b73 --- /dev/null +++ b/prism/data/repair/align.py @@ -0,0 +1,51 @@ +from prism.data.document import CoqDocument +from prism.data.extract_cache import _extract_vernac_commands, extract_cache +from prism.language.gallina.parser import CoqParser +from prism.project.base import SEM, Project +from prism.project.repo import ProjectRepo +from prism.tests import _COQ_EXAMPLES_PATH, _PROJECT_EXAMPLES_PATH +from prism.util.radpytools.os import pushd +from prism.data.build_cache import VernacSentence, ProjectCommitData +from typing import List + +""" +with pushd(_COQ_EXAMPLES_PATH): + extracted_commands = _extract_vernac_commands( + Project.extract_sentences( + CoqDocument("Alphabet.v", + CoqParser.parse_source("Alphabet.v"), + _COQ_EXAMPLES_PATH), + sentence_extraction_method=SEM.HEURISTIC, + return_locations=True, + glom_proofs=False), + serapi_options="") +""" + +from prism.util.alignment import lazy_align, fast_edit_distance + +def normalized_string_alignment(a : str,b : str): + cost, _ = fast_edit_distance(a,b, return_cost=True) + return cost/max(len(a),len(b)) + +def file_alignment(a: List[VernacSentence],b : List[VernacSentence]): + return lazy_align(range(len(a)),range(len(b)), + lambda x,y: normalized_string_alignment(a[x].text,b[x].text), + lambda x: 0.75) + # the last, fixed value is a hyperparameter tradeoff between skipping and mis-matching + # a value of 1.0 always mismatches and a value of 0.0 always skips. + + +def align_commits(a : ProjectCommitData, b : ProjectCommitData): + """ + Totally untested draft function! + Don't merge this! + """ + # only attempt to align files present in both roots. + alignable_files = a.command_data.keys()&b.command_data.keys() + aligned_files = {} + for f in alignable_files: + a_sentences = [x.command for x in a.command_data[f]] + b_sentences = [x.command for x in b.command_data[f]] + aligned_files[f] = file_alignment(a_sentences,b_sentences) + + return aligned_files