From 251f1e2ffb14cdfeeebde0cfbc5828fc236ef11b Mon Sep 17 00:00:00 2001
From: Jelle Meeus <jellemeeus@hotmail.com>
Date: Tue, 26 Mar 2024 15:55:38 +0100
Subject: [PATCH] Rename loaders to load_data and move shared helpers out of format_spanish.py

---
 src/format_english.ipynb |  2 +-
 src/format_english.py    |  2 +-
 src/format_spanish.ipynb |  2 +-
 src/format_spanish.py    | 75 +++-------------------------------------
 src/utils/dataset.py     | 33 ++++++++++++++++--
 5 files changed, 39 insertions(+), 75 deletions(-)
diff --git a/src/format_english.ipynb b/src/format_english.ipynb
index 1db8fde..c0b04c3 100644
--- a/src/format_english.ipynb
+++ b/src/format_english.ipynb
@@ -199,7 +199,7 @@
    "outputs": [],
    "source": [
     "from utils.utils import replace_word_in_field_with_underscore\n",
-    "from utils.dataset import load_english as load_data"
+    "from utils.dataset import load_data"
    ]
   },
   {
diff --git a/src/format_english.py b/src/format_english.py
index 8e4437d..0efa199 100644
--- a/src/format_english.py
+++ b/src/format_english.py
@@ -22,7 +22,7 @@
 DATASET = args.dataset
 IS_GENERATE_PDF = args.generate_pdf
 
-load_data = EnglishData(DATASET).load_english
+load_data = EnglishData(DATASET).load_data
 
 # %% [markdown]
 # ## HTML+PDF all columns alphabetical
diff --git a/src/format_spanish.ipynb b/src/format_spanish.ipynb
index aad9c5b..deeef6b 100644
--- a/src/format_spanish.ipynb
+++ b/src/format_spanish.ipynb
@@ -274,7 +274,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from utils.dataset import load_spanish2 as load_data2"
+    "from utils.dataset import load_data as load_data2"
    ]
   },
   {
diff --git a/src/format_spanish.py b/src/format_spanish.py
index f68e01e..7a7ea99 100644
--- a/src/format_spanish.py
+++ b/src/format_spanish.py
@@ -1,9 +1,3 @@
-# %%
-import re
-
-import pandas as pd
-
-# %%
 import argparse
 
 from utils.dataset import SpanishData
@@ -17,7 +11,8 @@
 DATASET = args.dataset
 IS_GENERATE_PDF = args.generate_pdf
 
-load_data = SpanishData(DATASET).load_spanish
+load_data = SpanishData(DATASET).load_data
+cefrs_data_by_dataset = SpanishData(DATASET).cefrs_data_by_dataset
 
 # HTML, Underscore, Shuffled and Alphabetical
 def format_html_all_columns(is_shuffle=True, is_alphabetical=False, with_underscore=True):
@@ -62,38 +57,9 @@ def format_html_all_columns(is_shuffle=True, is_alphabetical=False, with_undersc
     f.write('<meta charset="UTF-8">'+html)
 
 filenames = [DATASET+'_alphabetical', DATASET+'_shuffled', DATASET+'_underscore_shuffled', DATASET + '_underscore_alphabetical']
-#for filename in filenames:
-#    cmd = f'pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=1cm -V margin-bottom=1cm -V margin-left=1cm -V margin-right=1cm -c format/table.css '
-#    os.system(cmd)
 
 # ## HTML+PDF all columns grouped by CEFR
 
-# By Ranking / pseudo-cefr. Shuffled
-def cefrs_data_by_dataset(data):
-    if DATASET=='oxford_3000':
-        cefrs = ['A1', 'A2', 'B1']
-        data_by_cefr = [
-            data.iloc[:1000],
-            data.iloc[1000:2000],
-            data.iloc[2000:],
-            ]
-    elif DATASET=='oxford_5000_exclusive':
-        cefrs = ['B2', 'C1']
-        data_by_cefr = [
-            data.iloc[:1000],
-            data.iloc[1000:],
-            ]
-    else:
-        cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']
-        data_by_cefr = [
-            data.iloc[:1000],
-            data.iloc[1000:2000],
-            data.iloc[2000:3000],
-            data.iloc[3000:4000],
-            data.iloc[4000:],
-            ]
-    return cefrs, data_by_cefr
-
 # Complete to HTML
 def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_underscore=True):
     data = load_data()
@@ -104,7 +70,6 @@ def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_und
     cefrs, data_by_cefr = cefrs_data_by_dataset(data)
 
     data_by_cefr[1].head()
-    cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']
 
     html_out = ''
     for i, data_slice in enumerate(data_by_cefr):
@@ -146,17 +111,7 @@ def format_html_columns_by_cefr(is_shuffle=True, is_alphabetical=False, with_und
 # shuffle and alphabetical
 # Fix supertabular and add \textit to type
 column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
-def fix_latex_line(line):
-    if re.match(r"^\\begin{supertabular}", line):
-        # Add column_format to supertabular}
-        return '\\begin{supertabular}'+'{'+column_format+'}'
-    if re.match(r"^\\.*{tabular}", line):
-        # Remove {tabular}
-        return ''
-    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line):
-        # Italics
-        return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
-    return line
+fix_latex_line = FixLatexLine(column_format).fix_latex_line
 
 def format_latex_columns(is_alphabetical=False, is_shuffle=True):
     # columns = ["word", "type", "english", "frequency_rank"]
@@ -197,17 +152,7 @@ def format_latex_columns(is_alphabetical=False, is_shuffle=True):
 # two column, cefr by rank
 # Fix supertabular and add \textit to type
 column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
-def fix_latex_line(line):
-    if re.match(r"^\\begin{supertabular}", line):
-        # Add column_format to supertabular}
-        return '\\begin{supertabular}'+'{'+column_format+'}'
-    if re.match(r"^\\.*{tabular}", line):
-        # Remove {tabular}
-        return ''
-    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line):
-        # Italics
-        return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
-    return line
+fix_latex_line = FixLatexLine(column_format).fix_latex_line
 
 def format_latex_columns_by_cefr(is_alphabetical=True, is_shuffle=False):
     data = load_data()
@@ -260,17 +205,7 @@ def format_latex_columns_by_cefr(is_alphabetical=True, is_shuffle=False):
 #column_format = 'p{1.0in}p{3.0in}p{3.0in}' # total 8.3in - 0.7874in - column_width
 #column_format = 'p{0.8in}p{1.1in}p{2.55in}p{2.55in}' # total 8.3in - 0.7874in - column_width
 column_format = 'p{0.9in}p{1.0in}p{2.8in}p{2.30in}' # total 8.3in - 0.7874in - column_width
-def fix_latex_line(line):
-    if re.match(r"^\\begin{supertabular}", line):
-        # Add column_format to supertabular}
-        return '\\begin{supertabular}'+'{'+column_format+'}'
-    if re.match(r"^\\.*{tabular}", line):
-        # Remove {tabular}
-        return ''
-    if re.match(r"^\w+\s.*\(\w+\s?\w+?\)", line):
-        # Italics
-        return re.sub(r"(^\w+\s.*)(\(\w+\s?\w+?\))", r"\1\\textit{\2}", line)
-    return line
+fix_latex_line = FixLatexLine(column_format).fix_latex_line
 
 def format_latex_columns_by_cefr_with_example(is_alphabetical=True, is_shuffle=False):
     data = load_data()
diff --git a/src/utils/dataset.py b/src/utils/dataset.py
index 5f5aee1..33ba37d 100644
--- a/src/utils/dataset.py
+++ b/src/utils/dataset.py
@@ -4,7 +4,7 @@ class EnglishData:
     def __init__(self, dataset: str):
         self.dataset = dataset
 
-    def load_english(self):
+    def load_data(self):
         dataset = self.dataset
         df = pd.read_pickle(f"./data/english/{dataset}.pkl")
         df.head()
@@ -27,4 +27,33 @@ def load_data(self):
             data = data.iloc[:3000] # A1, A2, B1
         elif dataset == 'spanish_5000_exclusive':
             data = data.iloc[3000:] # B2, C1
-        return data
\ No newline at end of file
+        return data
+
+    # Split data into pseudo-CEFR levels by row position (ranking order), in 1000-row steps; the last level takes the remainder.
+    def cefrs_data_by_dataset(self, data):  # returns (cefrs, data_by_cefr): level labels and the matching slices of data
+        DATASET = self.dataset  # dataset name stored on the instance; it selects which split is used
+        if DATASET=='spanish_3000':
+            cefrs = ['A1', 'A2', 'B1']
+            data_by_cefr = [
+                data.iloc[:1000],  # A1: first 1000 rows by position
+                data.iloc[1000:2000],  # A2
+                data.iloc[2000:],  # B1: all remaining rows
+                ]
+        elif DATASET=='spanish_5000_exclusive':
+            cefrs = ['B2', 'C1']
+            data_by_cefr = [
+                data.iloc[:1000],  # B2: first 1000 rows of the data passed in
+                data.iloc[1000:],  # C1: all remaining rows
+                ]
+        else:  # any other dataset name falls back to the five-level split
+            cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']
+            data_by_cefr = [
+                data.iloc[:1000],  # A1
+                data.iloc[1000:2000],  # A2
+                data.iloc[2000:3000],  # B1
+                data.iloc[3000:4000],  # B2
+                data.iloc[4000:],  # C1: everything from position 4000 on
+                ]
+        return cefrs, data_by_cefr  # the two lists are index-aligned: cefrs[i] labels data_by_cefr[i]
+
+    # Complete to HTML
\ No newline at end of file