Skip to content

Commit

Permalink
Space Manipulation (#44)
Browse files Browse the repository at this point in the history
* error wrapper because it makes things so much nicer

* Space manipulation

* version bump
  • Loading branch information
mr-martian authored Oct 18, 2021
1 parent 37c26cc commit 27b8b6a
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 95 deletions.
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ AC_PREREQ(2.61)
m4_define([required_libxml_version], [2.6.17])
m4_define([required_lttoolbox_version], [3.6.0])

AC_INIT([apertium-separable], [0.4.0], [apertium-stuff@lists.sourceforge.net])
AC_INIT([apertium-separable], [0.5.0], [apertium-stuff@lists.sourceforge.net])
AM_INIT_AUTOMAKE
AC_CONFIG_MACRO_DIR([m4])

Expand Down
101 changes: 46 additions & 55 deletions src/lsx_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@ using namespace std;

UString const Compiler::COMPILER_ANYTAG_ELEM = "t"_u;
UString const Compiler::COMPILER_ANYCHAR_ELEM = "w"_u;
UString const Compiler::COMPILER_WB_ELEM = "j"_u;
UString const Compiler::COMPILER_WB_ELEM = "d"_u;
// Name of the attribute read on the word-boundary element, and the two values
// readString() recognises for it: "yes" selects the symbol stored in
// word_boundary_s, "no" selects word_boundary_ns, and any other value
// (including an absent attribute) falls back to the plain word_boundary symbol.
UString const Compiler::COMPILER_SPACE_ATTR = "space"_u;
UString const Compiler::COMPILER_SPACE_YES_VAL = "yes"_u;
UString const Compiler::COMPILER_SPACE_NO_VAL = "no"_u;

// Alphabet symbols for the two word-boundary variants. parse() registers both
// with the alphabet and caches their codes in word_boundary_s and
// word_boundary_ns respectively.
// TODO: these should be in lttoolbox so lt-trim can use them
UString const Compiler::SYMBOL_WB_SPACE = "<$_>"_u;
UString const Compiler::SYMBOL_WB_NO_SPACE = "<$->"_u;

void
Compiler::parse(string const &fichero, UString const &dir)
Expand All @@ -47,9 +54,13 @@ Compiler::parse(string const &fichero, UString const &dir)
alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL);
alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL);
alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL);
any_tag = alphabet(Transducer::ANY_TAG_SYMBOL);
any_char = alphabet(Transducer::ANY_CHAR_SYMBOL);
word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL);
alphabet.includeSymbol(Compiler::SYMBOL_WB_SPACE);
alphabet.includeSymbol(Compiler::SYMBOL_WB_NO_SPACE);
any_tag = alphabet(Transducer::ANY_TAG_SYMBOL);
any_char = alphabet(Transducer::ANY_CHAR_SYMBOL);
word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL);
word_boundary_s = alphabet(Compiler::SYMBOL_WB_SPACE);
word_boundary_ns = alphabet(Compiler::SYMBOL_WB_NO_SPACE);

int ret = xmlTextReaderRead(reader);
while(ret == 1)
Expand Down Expand Up @@ -111,9 +122,7 @@ Compiler::procAlphabet()
}
else
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Missing alphabet symbols." << endl;
exit(EXIT_FAILURE);
error("Missing alphabet symbols.");
}
}
}
Expand Down Expand Up @@ -234,9 +243,7 @@ Compiler::requireEmptyError(UString const &name)
{
if(!xmlTextReaderIsEmptyElement(reader))
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Non-empty element '<" << name << ">' should be empty." << endl;
exit(EXIT_FAILURE);
error("Non-empty element '<%S>' should be empty.", name.c_str());
}
}

Expand Down Expand Up @@ -288,12 +295,15 @@ Compiler::readString(vector<int> &result, UString const &name)

if(!alphabet.isSymbolDefined(symbol))
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Undefined symbol '" << symbol << "'." << endl;
exit(EXIT_FAILURE);
error("Undefined symbol '%S'.", symbol.c_str());
}
result.push_back(alphabet(symbol));
}
else if(name == COMPILER_JOIN_ELEM)
{
requireEmptyError(name);
result.push_back(static_cast<int>('+'));
}
else if(name == COMPILER_ANYTAG_ELEM)
{
result.push_back(any_tag);
Expand All @@ -305,15 +315,19 @@ Compiler::readString(vector<int> &result, UString const &name)
else if(name == COMPILER_WB_ELEM)
{
requireEmptyError(name);
result.push_back(word_boundary);
UString mode = attrib(COMPILER_SPACE_ATTR);
if (mode == COMPILER_SPACE_YES_VAL) {
result.push_back(word_boundary_s);
} else if (mode == COMPILER_SPACE_NO_VAL) {
result.push_back(word_boundary_ns);
} else {
result.push_back(word_boundary);
}
}

else
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid specification of element '<" << name;
cerr << ">' in this context." << endl;
exit(EXIT_FAILURE);
error("Invalid specification of element '<%S>' in this context.", name.c_str());
}
}

Expand All @@ -326,9 +340,7 @@ Compiler::skipBlanks(UString &name)
{
if(!allBlanks())
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid construction." << endl;
exit(EXIT_FAILURE);
error("Invalid construction.");
}
}

Expand Down Expand Up @@ -361,9 +373,7 @@ Compiler::skip(UString &name, UString const &elem, bool open)
{
if(!allBlanks())
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid construction." << endl;
exit(EXIT_FAILURE);
error("Invalid construction.");
}
}
xmlTextReaderRead(reader);
Expand All @@ -372,9 +382,7 @@ Compiler::skip(UString &name, UString const &elem, bool open)

if(name != elem)
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Expected '<" << slash << elem << ">'." << endl;
exit(EXIT_FAILURE);
error("Expected '<%S%S>'.", slash.c_str(), elem.c_str());
}
}

Expand Down Expand Up @@ -480,16 +488,12 @@ Compiler::procPar()

if(!current_paradigm.empty() && nomparadigma == current_paradigm)
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Paradigm refers to itself '" << nomparadigma << "'." <<endl;
exit(EXIT_FAILURE);
error("Paradigm '%S' refers to itself.", nomparadigma.c_str());
}

if(paradigms.find(nomparadigma) == paradigms.end())
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Undefined paradigm '" << nomparadigma << "'." << endl;
exit(EXIT_FAILURE);
error("Reference to undefined paradigm '%S'.", nomparadigma.c_str());
}
e.setParadigm(nomparadigma);
return e;
Expand Down Expand Up @@ -522,9 +526,7 @@ Compiler::insertEntryTokens(vector<EntryToken> const &elements)
}
else
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid entry token." << endl;
exit(EXIT_FAILURE);
error("Invalid entry token.");
}
}
t.setFinal(e);
Expand Down Expand Up @@ -597,12 +599,9 @@ Compiler::requireAttribute(UString const &value, UString const &attrname,
UString const &elemname)
{
if(value.empty()) {
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): '<" << elemname;
cerr << "' element must specify non-void '";
cerr << attrname << "' attribute." << endl;
exit(EXIT_FAILURE);
}
error("Element '<%S>' must specify a non-void value for attribute '%S'.",
elemname.c_str(), attrname.c_str());
}
}


Expand Down Expand Up @@ -665,9 +664,7 @@ Compiler::procEntry()
int ret = xmlTextReaderRead(reader);
if(ret != 1)
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Parse error." << endl;
exit(EXIT_FAILURE);
error("Parse error.");
}
UString name = XMLParseUtil::readName(reader);
skipBlanks(name);
Expand Down Expand Up @@ -700,9 +697,7 @@ Compiler::procEntry()

if(paradigms.find(p) == paradigms.end())
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Undefined paradigm '" << p << "'." <<endl;
exit(EXIT_FAILURE);
error("Undefined paradigm '%S'.", p.c_str());
}
// descartar entradas con paradigms vacíos (por las direcciones,
// normalmente
Expand Down Expand Up @@ -732,10 +727,8 @@ Compiler::procEntry()
}
else
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM;
cerr << ">'." << endl;
exit(EXIT_FAILURE);
error("Invalid inclusion of '<%S>' in '<%S>'.", name.c_str(),
COMPILER_ENTRY_ELEM.c_str());
}

}
Expand Down Expand Up @@ -790,9 +783,7 @@ Compiler::procNode()
}
else
{
cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
cerr << "): Invalid node '<" << nombre << ">'." << endl;
exit(EXIT_FAILURE);
error("Invalid node '<%S>'.", nombre.c_str());
}
}

Expand Down
16 changes: 16 additions & 0 deletions src/lsx_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ class Compiler
int32_t any_tag = 0;
int32_t any_char = 0;
int32_t word_boundary = 0;
int32_t word_boundary_s = 0;
int32_t word_boundary_ns = 0;

/**
* List of named transducers-paradigms
Expand All @@ -129,6 +131,15 @@ class Compiler
*/
map<UString, map<UString, int>> postsuffix_paradigms;

template<typename... T>
void error(const char* fmt, T... args) {
UFILE* out = u_finit(stderr, NULL, NULL);
u_fprintf(out, "Error on line %d: ",
xmlTextReaderGetParserLineNumber(reader));
u_fprintf(out, fmt, args...);
u_fprintf(out, "\n");
exit(EXIT_FAILURE);
}

/*
static string range(char const a, char const b);
Expand Down Expand Up @@ -306,6 +317,11 @@ class Compiler
static UString const COMPILER_ANYTAG_ELEM;
static UString const COMPILER_ANYCHAR_ELEM;
static UString const COMPILER_WB_ELEM;
static UString const COMPILER_SPACE_ATTR;
static UString const COMPILER_SPACE_YES_VAL;
static UString const COMPILER_SPACE_NO_VAL;
static UString const SYMBOL_WB_SPACE;
static UString const SYMBOL_WB_NO_SPACE;


/**
Expand Down
79 changes: 48 additions & 31 deletions src/lsx_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ LSXProcessor::load(FILE *input)
// symbols
alphabet.read(input);
word_boundary = alphabet("<$>"_u);
word_boundary_s = alphabet("<$_>"_u);
word_boundary_ns = alphabet("<$->"_u);
any_char = alphabet("<ANY_CHAR>"_u);
any_tag = alphabet("<ANY_TAG>"_u);

Expand Down Expand Up @@ -247,7 +249,7 @@ LSXProcessor::processWord(InputFile& input, UFILE* output)
s.step_override(lu[i], towlower(lu[i]), any_char, lu[i]);
}
}
s.step(word_boundary);
s.step(word_boundary, word_boundary_s, word_boundary_ns);
if(s.isFinal(all_finals))
{
last_final = idx+1;
Expand All @@ -270,23 +272,7 @@ LSXProcessor::processWord(InputFile& input, UFILE* output)
lu_queue.pop_front();
return;
}
vector<UString> out_lus;
size_t pos = 0;
while(pos != UString::npos && pos != last_final_out.size())
{
size_t start = pos;
pos = last_final_out.find("<$>"_u, start);
if(pos == UString::npos)
{
out_lus.push_back(last_final_out.substr(start));
}
else
{
out_lus.push_back(last_final_out.substr(start, pos-start));
pos += 3;
}
}


UString wblank;
for(size_t i = 0; i < last_final; i++)
{
Expand All @@ -308,30 +294,61 @@ LSXProcessor::processWord(InputFile& input, UFILE* output)
{
wblank += "]]"_u;
}

size_t i = 0;
for(; i < out_lus.size(); i++)

size_t output_count = 0;
size_t pos = 0;
bool pop_queue = true;
bool replace_empty = false;
while(pos != UString::npos && pos != last_final_out.size())
{
if(i < last_final)
if (pop_queue) {
if (output_count < last_final) {
write(blank_queue[output_count], output);
if (replace_empty && blank_queue[output_count].empty()) {
u_fputc(' ', output);
}
output_count++;
} else {
u_fputc(' ', output);
}
}
write(wblank, output);
u_fputc('^', output);
size_t start = pos;
pos = last_final_out.find("<$"_u, start);
if(pos == UString::npos)
{
write(blank_queue[i], output);
write(last_final_out.substr(start), output);
u_fputc('$', output);
break;
}
else
{
u_fputc(' ', output);
write(last_final_out.substr(start, pos-start), output);
u_fputc('$', output);
pos += 2;
if (last_final_out[pos] == '-') {
pop_queue = false;
pos++;
} else if (last_final_out[pos] == '_') {
pop_queue = true;
replace_empty = true;
pos++;
} else {
pop_queue = true;
replace_empty = false;
}
pos++;
}
write(wblank, output);
u_fputc('^', output);
write(out_lus[i], output);
u_fputc('$', output);
}
for(; i < last_final; i++)
for(; output_count < last_final; output_count++)
{
if(blank_queue[i] != " "_u)
if(blank_queue[output_count] != " "_u)
{
write(blank_queue[i], output);
write(blank_queue[output_count], output);
}
}

blank_queue.erase(blank_queue.begin(), blank_queue.begin()+last_final);
bound_blank_queue.erase(bound_blank_queue.begin(), bound_blank_queue.begin()+last_final);
lu_queue.erase(lu_queue.begin(), lu_queue.begin()+last_final);
Expand Down
2 changes: 2 additions & 0 deletions src/lsx_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class LSXProcessor
void processWord(InputFile& input, UFILE* output);

int word_boundary;
int word_boundary_s;
int word_boundary_ns;
int any_char;
int any_tag;
public:
Expand Down
Loading

0 comments on commit 27b8b6a

Please sign in to comment.