Skip to content

Commit

Permalink
feat(schema): enable ControlledVocabularyValue data-binding from TSV I…
Browse files Browse the repository at this point in the history
…QSS#8085

- Add headers enum like with MetadataBlock and DatasetFieldType
- Add a placeholder for alternate values that need references to
  the dataset field this CVV is a part of
- Make the alternatives come from a column with a header
  (This is undocumented behaviour in the docs currently!)
- Proper validation as with the other data bindings
- Includes lots of tests to make sure we notice when it breaks.
  • Loading branch information
poikilotherm committed Sep 7, 2021
1 parent 16bf03c commit 336825d
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

package edu.harvard.iq.dataverse;

import com.univocity.parsers.annotations.Parsed;
import com.univocity.parsers.annotations.Validate;
import edu.harvard.iq.dataverse.util.BundleUtil;
import edu.harvard.iq.dataverse.util.metadata.Placeholder;
import org.apache.commons.lang3.StringUtils;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Locale;
import java.util.Objects;
import java.util.logging.Logger;
import java.util.MissingResourceException;
import java.util.stream.Collectors;
import javax.persistence.CascadeType;
import javax.persistence.Column;
import javax.persistence.Entity;
Expand All @@ -38,11 +37,41 @@ public class ControlledVocabularyValue implements Serializable {

private static final Logger logger = Logger.getLogger(ControlledVocabularyValue.class.getCanonicalName());

public static final Comparator<ControlledVocabularyValue> DisplayOrder = new Comparator<ControlledVocabularyValue>() {
@Override
public int compare(ControlledVocabularyValue o1, ControlledVocabularyValue o2) {
return Integer.compare( o1.getDisplayOrder(), o2.getDisplayOrder() );
}};
/**
* Identifiers are used to match either URLs (Term), URIs (PID) or string containing only A-Z, a-z, 0-9, _, + and -
* (If no identifier is set, the value will be used, so it may contain spaces in the end. But IF you provide
* an identifier, you do it for good reasons. Any real identifiers out there don't contain whitespace for a reason)
*/
public static final String IDENTIFIER_MATCH_REGEX = "^(\\w+:(\\/\\/)?[\\w\\-+&@#/%?=~|!:,.;]*[\\w\\-+&@#/%=~|]|[\\w\\-\\+]+)$";
public static final Comparator<ControlledVocabularyValue> DisplayOrder = Comparator.comparingInt(ControlledVocabularyValue::getDisplayOrder);

/**
 * Column headers of a controlled vocabulary value line in a TSV file, declared in column order.
 * The header names are kept in {@link Constants} so they can also be referenced from annotations,
 * which only accept compile-time constants.
 */
public enum Headers {
    DATASET_FIELD(Constants.DATASET_FIELD),
    VALUE(Constants.VALUE),
    IDENTIFIER(Constants.IDENTIFIER),
    DISPLAY_ORDER(Constants.DISPLAY_ORDER),
    ALT_VALUES(Constants.ALT_VALUES);

    /** Header names as compile-time constants. */
    public static final class Constants {
        public static final String DATASET_FIELD = "DatasetField";
        public static final String VALUE = "Value";
        public static final String IDENTIFIER = "identifier";
        public static final String DISPLAY_ORDER = "displayOrder";
        public static final String ALT_VALUES = "altValue";
    }

    private final String key;

    Headers(String key) {
        this.key = key;
    }

    /**
     * @return the header name of this column
     */
    public String key() {
        return this.key;
    }

    /**
     * @return a newly allocated array holding the header names of all columns, in declaration order
     */
    public static String[] keys() {
        // Stream straight into the array; an intermediate list adds nothing here.
        return Arrays.stream(values()).map(Headers::key).toArray(String[]::new);
    }
}

public ControlledVocabularyValue() {
}
Expand Down Expand Up @@ -71,9 +100,11 @@ public void setId(Long id) {
public String getStrValue() {
return strValue;
}

/**
 * Sets the string value of this controlled vocabulary value.
 * Bound to the TSV column named by {@link Headers.Constants#VALUE}.
 * NOTE(review): the bare {@code @Validate} presumably rejects null and blank input while parsing
 * (the parser tests expect a DataValidationException for "" and " ") - confirm against the univocity docs.
 *
 * @param strValue the value to store
 */
@Parsed(field = Headers.Constants.VALUE)
@Validate
public void setStrValue(String strValue) {
this.strValue = strValue;

}

private String identifier;
Expand All @@ -82,15 +113,29 @@ public String getIdentifier() {
return identifier;
}

/**
 * Sets the identifier of this controlled vocabulary value.
 * Bound to the TSV column named by {@link Headers.Constants#IDENTIFIER}. The validation annotation
 * permits a missing identifier and otherwise requires a match of {@link #IDENTIFIER_MATCH_REGEX}.
 * The setter itself performs no check, so direct callers are not validated.
 *
 * @param identifier the identifier to store, may be null
 */
@Parsed(field = Headers.Constants.IDENTIFIER)
@Validate(nullable = true, matches = IDENTIFIER_MATCH_REGEX)
public void setIdentifier(String identifier) {
this.identifier = identifier;
}



private int displayOrder;
public int getDisplayOrder() { return this.displayOrder;}
public void setDisplayOrder(int displayOrder) {this.displayOrder = displayOrder;}
public int getDisplayOrder() {
return this.displayOrder;
}
public void setDisplayOrder(int displayOrder) {
this.displayOrder = displayOrder;
}
/**
 * Sets the display order from its string representation.
 * Bound to the TSV column named by {@link Headers.Constants#DISPLAY_ORDER}. The validation pattern
 * only admits strings made of digits, i.e. non-negative integers (no sign, no decimals).
 *
 * @param displayOrder the display order as a string of digits
 * @throws NumberFormatException if the string cannot be parsed as an {@code int}, e.g. when this
 *         method is called directly with unvalidated input or the number exceeds the int range
 */
@Parsed(field = Headers.Constants.DISPLAY_ORDER)
@Validate(matches = "^\\d+$")
public void setDisplayOrder(String displayOrder) {
this.displayOrder = Integer.parseInt(displayOrder);
}


@ManyToOne
Expand All @@ -102,6 +147,13 @@ public DatasetFieldType getDatasetFieldType() {
public void setDatasetFieldType(DatasetFieldType datasetFieldType) {
this.datasetFieldType = datasetFieldType;
}

/**
 * Data-binding setter for the TSV column named by {@link Headers.Constants#DATASET_FIELD}.
 * Only the field name is known at this point, so a {@link Placeholder.DatasetFieldType}
 * carrying that name is stored as the dataset field type.
 *
 * @param datasetFieldType name of the dataset field type this value belongs to
 */
@Parsed(field = Headers.Constants.DATASET_FIELD)
@Validate(matches = DatasetFieldType.FIELD_NAME_REGEX)
private void setDatasetFieldType(String datasetFieldType) {
    Placeholder.DatasetFieldType placeholder = new Placeholder.DatasetFieldType();
    this.datasetFieldType = placeholder;
    placeholder.setName(datasetFieldType);
}

@OneToMany(mappedBy = "controlledVocabularyValue", cascade = {CascadeType.REMOVE, CascadeType.MERGE, CascadeType.PERSIST}, orphanRemoval=true)
private Collection<ControlledVocabAlternate> controlledVocabAlternates = new ArrayList<>();
Expand All @@ -113,6 +165,23 @@ public Collection<ControlledVocabAlternate> getControlledVocabAlternates() {
public void setControlledVocabAlternates(Collection<ControlledVocabAlternate> controlledVocabAlternates) {
this.controlledVocabAlternates = controlledVocabAlternates;
}

/**
 * Data-binding hook for the TSV column(s) named by {@link Headers.Constants#ALT_VALUES}.
 * This is a workaround that lets a TSV file carry an arbitrary number of "altValue" columns,
 * each providing one alternative value for this controlled vocabulary value.
 * Null or blank input is ignored. Anything else is wrapped in a
 * {@link Placeholder.ControlledVocabAlternate} that points back to this value and is added
 * to the collection of alternates.
 *
 * @param alternative the alternative value, may be null or blank
 */
@Parsed(field = Headers.Constants.ALT_VALUES)
@Validate(nullable = true, allowBlanks = true)
private void addControlledVocabAlternates(String alternative) {
    boolean hasContent = alternative != null && !alternative.isBlank();
    if (hasContent) {
        ControlledVocabAlternate placeholder = new Placeholder.ControlledVocabAlternate();
        placeholder.setControlledVocabularyValue(this);
        placeholder.setStrValue(alternative);
        this.controlledVocabAlternates.add(placeholder);
    }
}

public String getLocaleStrValue() {
return getLocaleStrValue(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package edu.harvard.iq.dataverse.util.metadata;

import edu.harvard.iq.dataverse.ControlledVocabAlternate;

/**
* This class provides some simple markers, so we can distinguish if we need to replace a placeholder with
* a real object from the database/... when handing over after parsing
Expand All @@ -8,4 +10,5 @@ public class Placeholder {
public static final class Dataverse extends edu.harvard.iq.dataverse.Dataverse {}
public static final class MetadataBlock extends edu.harvard.iq.dataverse.MetadataBlock {}
public static final class DatasetFieldType extends edu.harvard.iq.dataverse.DatasetFieldType {}
public static final class ControlledVocabAlternate extends edu.harvard.iq.dataverse.ControlledVocabAlternate {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
package edu.harvard.iq.dataverse.util.metadata;

import com.univocity.parsers.common.DataValidationException;
import com.univocity.parsers.common.processor.BeanListProcessor;
import com.univocity.parsers.tsv.TsvParser;
import com.univocity.parsers.tsv.TsvParserSettings;
import edu.harvard.iq.dataverse.ControlledVocabularyValue;
import edu.harvard.iq.dataverse.DatasetFieldType;
import edu.harvard.iq.dataverse.MetadataBlock;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EmptySource;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.*;

class ControlledVocabularyValueTest {

static BeanListProcessor<ControlledVocabularyValue> controlledVocabularyValueProcessor = new BeanListProcessor<>(ControlledVocabularyValue.class);
static TsvParser parser;
static TsvParserSettings settings = new TsvParserSettings();

@BeforeAll
static void setUp() {
settings.setProcessor(controlledVocabularyValueProcessor);
settings.setHeaderExtractionEnabled(true);
// TODO: replace this char with a global constant (introduced when creating the parsing bean)
settings.getFormat().setComment('\'');
parser = new TsvParser(settings);
}

@ParameterizedTest
@EmptySource
@ValueSource(strings = {" "})
public void parseInvalidValue(String value) {
// given
StringReader subjectUnderTest = new StringReader(generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, "test",
ControlledVocabularyValue.Headers.VALUE, value,
ControlledVocabularyValue.Headers.DISPLAY_ORDER, "0")));
// when & then
assertThrows(DataValidationException.class, () -> parser.parse(subjectUnderTest));
}

@ParameterizedTest
@ValueSource(strings = {"https://www^^", "doi:1234^", "hello my name", "hello!", "hello#"})
public void parseInvalidIdentifier(String identifier) {
// given
StringReader subjectUnderTest = new StringReader(generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, "test",
ControlledVocabularyValue.Headers.VALUE, "test",
ControlledVocabularyValue.Headers.DISPLAY_ORDER, "0",
ControlledVocabularyValue.Headers.IDENTIFIER, identifier)));
// when & then
assertThrows(DataValidationException.class, () -> parser.parse(subjectUnderTest));
}

@ParameterizedTest
@EmptySource
@ValueSource(strings = {"https://skosmos/foo#bar", "doi:1234", "hello_my-name", "foo+bar"})
public void parseValidIdentifier(String identifier) {
// given
StringReader subjectUnderTest = new StringReader(generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, "test",
ControlledVocabularyValue.Headers.VALUE, "test",
ControlledVocabularyValue.Headers.DISPLAY_ORDER, "0",
ControlledVocabularyValue.Headers.IDENTIFIER, identifier)));
// when
parser.parse(subjectUnderTest);
// then
assertEquals(1, controlledVocabularyValueProcessor.getBeans().size());
if (!identifier.isEmpty()) {
assertEquals(identifier, controlledVocabularyValueProcessor.getBeans().get(0).getIdentifier());
} else {
assertNull(controlledVocabularyValueProcessor.getBeans().get(0).getIdentifier());
}
}

@ParameterizedTest
@ValueSource(strings = {"-1", "-100", "0.00", "abc", "_foobar!"})
public void parseInvalidDisplayOrder(String displayOrder) {
// given
StringReader subjectUnderTest = new StringReader(generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, "test",
ControlledVocabularyValue.Headers.VALUE, "test",
ControlledVocabularyValue.Headers.DISPLAY_ORDER, displayOrder)));
// when & then
assertThrows(DataValidationException.class, () -> parser.parse(subjectUnderTest));
}

@ParameterizedTest
@ValueSource(strings = {".foobar", "!foo", "foo!", "_foo_", "-foo-foo", "foo.", "_foo.foo_", "1foo", ".bar"})
public void parseInvalidDatasetFieldTypeName(String fieldName) {
// given
StringReader subjectUnderTest = new StringReader(generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, fieldName,
ControlledVocabularyValue.Headers.VALUE, "test",
ControlledVocabularyValue.Headers.DISPLAY_ORDER, "0")));
// when & then
assertThrows(DataValidationException.class, () -> parser.parse(subjectUnderTest));
}

@Test
public void parseAlternateValues() {
// given
String t1 = "test1";
String t2 = "test2";
String tsv = generateCvvTSV(Map.of(
ControlledVocabularyValue.Headers.DATASET_FIELD, "test",
ControlledVocabularyValue.Headers.VALUE, "test",
ControlledVocabularyValue.Headers.DISPLAY_ORDER, "0"),
List.of(t1, t2));
StringReader subjectUnderTest = new StringReader(tsv);
// when
parser.parse(subjectUnderTest);
// then
assertEquals(1, controlledVocabularyValueProcessor.getBeans().size());

ControlledVocabularyValue result = controlledVocabularyValueProcessor.getBeans().get(0);

assertFalse(result.getControlledVocabAlternates().isEmpty());
assertTrue(result.getControlledVocabAlternates().stream().allMatch(a -> a instanceof Placeholder.ControlledVocabAlternate));

assertEquals(2, result.getControlledVocabAlternates().size());
assertTrue(result.getControlledVocabAlternates().stream().anyMatch(a -> t1.equals(a.getStrValue())));
assertTrue(result.getControlledVocabAlternates().stream().anyMatch(a -> t2.equals(a.getStrValue())));
}

private static final String header = "#controlledVocabulary\t" + String.join("\t", ControlledVocabularyValue.Headers.keys());

/**
 * Builds a two-line TSV snippet: the controlled vocabulary header line followed by one data line.
 * The data line starts with a tab and contains one cell per {@link ControlledVocabularyValue.Headers}
 * constant, in enum declaration order. Columns missing from the map become empty cells.
 *
 * @param values the cell content per column
 * @return the header line and the data line, joined by the parser settings' line separator
 */
public static String generateCvvTSV(Map<ControlledVocabularyValue.Headers, String> values) {
    String row = Arrays.stream(ControlledVocabularyValue.Headers.values())
        .map(column -> values.getOrDefault(column, ""))
        .collect(Collectors.joining("\t"));
    String lineSeparator = settings.getFormat().getLineSeparatorString();
    return header + lineSeparator + "\t" + row;
}

/**
 * Builds a two-line TSV snippet like {@link #generateCvvTSV(Map)}, but with one
 * {@link ControlledVocabularyValue.Headers#ALT_VALUES} column per alternative value.
 * The header line is extended with as many additional "altValue" columns as needed.
 * An empty list is allowed and yields the same output as the single-argument variant.
 * A non-empty ALT_VALUES entry in the map is emitted as its own cell in front of the list entries.
 *
 * @param values the cell content per regular column; missing columns become empty cells
 * @param altValues the alternative values, one cell each; may be empty but not null
 * @return the header line and the data line, joined by the parser settings' line separator
 */
public static String generateCvvTSV(Map<ControlledVocabularyValue.Headers, String> values, List<String> altValues) {
    // Regular columns in enum order. The alternates are appended afterwards, matching the
    // header where the "altValue" column(s) come last.
    String fieldCells = Arrays.stream(ControlledVocabularyValue.Headers.values())
        .filter(column -> column != ControlledVocabularyValue.Headers.ALT_VALUES)
        .map(column -> values.getOrDefault(column, ""))
        .collect(Collectors.joining("\t"));

    // Keep a map-supplied alternate separate instead of gluing it onto the first list entry.
    String mappedAlt = values.getOrDefault(ControlledVocabularyValue.Headers.ALT_VALUES, "");
    String altCells = String.join("\t", altValues);
    int altCount = altValues.size();
    if (!mappedAlt.isEmpty()) {
        altCells = altValues.isEmpty() ? mappedAlt : mappedAlt + "\t" + altCells;
        altCount++;
    }

    // The base header already has one "altValue" column; never pass a negative count to repeat().
    int extraAltColumns = Math.max(0, altCount - 1);
    String headerWithAlts = header
        + ("\t" + ControlledVocabularyValue.Headers.Constants.ALT_VALUES).repeat(extraAltColumns);

    return headerWithAlts + settings.getFormat().getLineSeparatorString()
        + "\t" + fieldCells + "\t" + altCells;
}
}

0 comments on commit 336825d

Please sign in to comment.