Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #2460: Compatibility with libre office files #2472

Merged
merged 6 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ public enum IngestError {
CSV_LINE_MISMATCH,
CSV_RECORD_MISMATCH,
EXCEL_PARSE,
EXCEL_UNKNOWN_OR_INVALID_COLUMN_COUNT,
EXCEL_UNKNOWN_VARIABLE_NAME,
EXCEL_AMBIGUOUS_INDEX_POSITION,
EXCEL_NO_ROWS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -122,12 +125,14 @@ public void processSheet(InputStream inputStream, DataTable dataTable, PrintWrit

XMLReader parser = fetchSheetParser(sst, dataTable, tempOut);

// rId2 found by processing the Workbook
// Seems to either be rId# or rSheet#
InputStream sheet1 = r.getSheet("rId1");
InputSource sheetSource = new InputSource(sheet1);
parser.parse(sheetSource);
sheet1.close();
Iterator<InputStream> sheetsData = r.getSheetsData();
// Reading only the first sheet
if (sheetsData.hasNext()) {
try (InputStream sheet1 = sheetsData.next()) {
InputSource sheetSource = new InputSource(sheet1);
parser.parse(sheetSource);
}
}
}

// -------------------- PRIVATE --------------------
Expand All @@ -143,7 +148,7 @@ private TabularDataIngest getTabularDataIngest(BufferedInputStream stream, File
throw ie;
} catch (Exception ex) {
logger.log(Level.FINE, "Could not parse Excel/XLSX spreadsheet.", ex);
throw new IngestException(IngestError.EXCEL_PARSE);
throw new IngestException(IngestError.EXCEL_PARSE, ex);
}

if (dataTable.getCaseQuantity() == null || dataTable.getCaseQuantity().intValue() < 1) {
Expand Down Expand Up @@ -257,14 +262,14 @@ private TabularDataIngest getTabularDataIngest(BufferedInputStream stream, File

private static class SheetHandler extends DefaultHandler {

private final Map<Integer, String> variableNames = new TreeMap<>();
private DataTable dataTable;
private SharedStringsTable sst;
private String cellContents;
private boolean nextIsString;
private boolean variableHeader;
private String[] variableNames;
private int caseCount;
private int columnCount;
private int columnIdx;
private boolean[] isNumericVariable;
private String[] dataRow;
private PrintWriter tempOut;
Expand All @@ -281,7 +286,7 @@ private SheetHandler(SharedStringsTable sst, DataTable dataTable, PrintWriter te
this.tempOut = tempOut;
variableHeader = true;
caseCount = 0;
columnCount = 0;
columnIdx = 0;
}

// -------------------- LOGIC --------------------
Expand All @@ -291,36 +296,12 @@ public void startElement(String uri, String localName, String name, Attributes a

// first raw encountered:
if (variableHeader && "row".equals(name)) {
Long varCount;
String rAttribute = attributes.getValue("t");
if (rAttribute == null) {
logger.warning("Null r attribute in the first row element!");
} else if (!rAttribute.equals("1")) {
logger.warning("Attribute r of the first row element is not \"1\"!");
}

String spansAttribute = attributes.getValue("spans");
if (spansAttribute == null) {
logger.warning("Null spans attribute in the first row element!");
}
int colIndex = spansAttribute.indexOf(':');
if (colIndex < 1 || (colIndex == spansAttribute.length() - 1)) {
logger.warning("Invalid spans attribute in the first row element: " + spansAttribute + "!");
}
try {
varCount = new Long(spansAttribute.substring(colIndex + 1));
} catch (Exception ex) {
varCount = null;
}

if (varCount == null || varCount.intValue() < 1) {
throw new IngestException(IngestError.EXCEL_UNKNOWN_OR_INVALID_COLUMN_COUNT);
Integer varCount = getColCount(attributes);
if (varCount != null) {
for (int i = 0; i < varCount; i++) {
variableNames.put(i, StringUtils.EMPTY);
}
}

logger.info("Established variable (column) count: " + varCount);

dataTable.setVarQuantity(varCount);
variableNames = new String[varCount.intValue()];
}

// c => cell
Expand All @@ -336,9 +317,9 @@ public void startElement(String uri, String localName, String name, Attributes a
if (!indexAttribute.matches(".*[0-9]")) {
logger.warning("Invalid index (r) attribute in a cell element: " + indexAttribute + "!");
}
columnCount = converter.columnToIndex(indexAttribute.replaceFirst("[0-9].*$", ""));
columnIdx = converter.columnToIndex(indexAttribute.replaceFirst("[0-9].*$", ""));

if (columnCount < 0) {
if (columnIdx < 0) {
throw new IngestException(IngestError.EXCEL_AMBIGUOUS_INDEX_POSITION);
}

Expand All @@ -363,13 +344,12 @@ public void endElement(String uri, String localName, String name) {
// Output after we've seen the string contents
if ("v".equals(name)) {
if (variableHeader) {
logger.fine("variable header mode; cell " + columnCount + ", cell contents: " + cellContents);
logger.fine("variable header mode; cell " + columnIdx + ", cell contents: " + cellContents);

//variableNames.add(cellContents);
variableNames[columnCount] = cellContents;
variableNames.put(columnIdx, cellContents);
} else {
dataRow[columnCount] = cellContents;
logger.fine("data row mode; cell " + columnCount + ", cell contents: " + cellContents);
dataRow[columnIdx] = cellContents;
logger.fine("data row mode; cell " + columnIdx + ", cell contents: " + cellContents);
}
}

Expand All @@ -378,15 +358,17 @@ public void endElement(String uri, String localName, String name) {
// Initialize variables:
logger.fine("variableHeader mode; ");
List<DataVariable> variableList = new ArrayList<DataVariable>();
//columnCount = variableNames.size();
columnCount = dataTable.getVarQuantity().intValue();
int columnCount = variableNames.size();
logger.info("Established variable (column) count: " + columnCount);
dataTable.setVarQuantity((long) columnCount);

for (int i = 0; i < columnCount; i++) {
String varName = variableNames[i];
for (int i = 0; i < variableNames.size(); i++) {
String varName = variableNames.get(i);

if (varName == null || varName.equals("")) {
if (varName == null || varName.equals(StringUtils.EMPTY)) {
varName = converter.indexToColumn(i);
}

if (varName == null) {
throw new IngestException(IngestError.EXCEL_UNKNOWN_VARIABLE_NAME, String.valueOf(i));
}
Expand Down Expand Up @@ -451,7 +433,7 @@ public void endElement(String uri, String localName, String name) {
tempOut.println(StringUtils.join(dataRow, "\t"));
caseCount++;
}
columnCount = 0;
columnIdx = 0;
dataRow = new String[dataTable.getVarQuantity().intValue()];
}

Expand All @@ -473,5 +455,26 @@ public void endElement(String uri, String localName, String name) {
public void characters(char[] ch, int start, int length) {
cellContents += new String(ch, start, length);
}

/**
 * Extracts the declared column count from the {@code spans} attribute of a row element.
 * <p>
 * The attribute is expected in the form {@code "first:last"}; the number after the
 * first colon is returned as the column count. The attribute may be absent or
 * malformed, so every failure path logs a warning and returns {@code null} instead
 * of throwing.
 *
 * @param attributes attributes of the row element being processed
 * @return the number parsed from the part after the colon, or {@code null} when the
 *         attribute is missing, has no usable {@code "x:y"} shape, or its upper bound
 *         is not a valid integer
 */
private static Integer getColCount(Attributes attributes) {
    String spansAttribute = attributes.getValue("spans");
    if (spansAttribute == null) {
        logger.warning("Null spans attribute in the first row element!");
        return null;
    }

    // The colon must have at least one character on each side.
    int colIndex = spansAttribute.indexOf(':');
    if (colIndex < 1 || (colIndex == spansAttribute.length() - 1)) {
        logger.warning("Invalid spans attribute in the first row element: " + spansAttribute + "!");
        return null;
    }

    try {
        return Integer.parseInt(spansAttribute.substring(colIndex + 1));
    } catch (NumberFormatException ex) {
        // Only parseInt can fail here; report it like the other malformed-attribute
        // cases instead of swallowing the problem silently.
        logger.warning("Invalid spans attribute in the first row element: " + spansAttribute + "!");
        return null;
    }
}

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.xlsx;

import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv.CSVFileReaderTest;
import edu.harvard.iq.dataverse.persistence.datafile.datavariable.DataVariable;
import io.vavr.Tuple;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.stream.Collectors;

import static org.assertj.core.api.Assertions.assertThat;

public class XLSXFileReaderTest {

@ParameterizedTest
@ValueSource(strings = { "xslx/table-google.xlsx", "xslx/table-libre.xlsx", "xslx/table-excel.xlsx" })
void read__various_sources(String xlsxFile) throws Exception {
// when
TabularDataIngest result = read(xlsxFile);

// then
assertThat(result.getDataTable().getVarQuantity()).isEqualTo(5);
assertThat(result.getDataTable().getDataVariables().stream().map(DataVariable::getName).collect(Collectors.toList()))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a change request.
We could also have written it a little more concisely:
`assertThat(result.getDataTable().getDataVariables()).extracting(DataVariable::getName)`

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, that's much better. Changed it.

.containsExactly(
"Id", "Item", "cost", "count", "total");
assertThat(Files.readAllLines(result.getTabDelimitedFile().toPath()))
.containsExactly(
"1.0\t\"Banana\"\t2.3\t4.0\t9.2",
"2.0\t\"Choco\"\t8.49\t2.0\t16.98",
"3.0\t\"Headset\"\t248.99\t1.0\t248.99");
}

/**
 * Verifies that a sheet whose header row has gaps gets generated column names
 * (A, D, G) for the unnamed columns and that all data rows are emitted.
 * Files flagged {@code false} are skipped via a JUnit assumption.
 */
@ParameterizedTest
@CsvSource({
    "xslx/missing-columns-libre.xlsx,false", // disabled, because unsupported by the current implementation
    "xslx/missing-columns-excel.xlsx,true" })
void read__missing_columns(String xlsxFile, boolean enabled) throws Exception {
    // given
    String skipReason = "Test file " + xlsxFile + " is disabled.";
    Assumptions.assumeTrue(enabled, skipReason);

    // when
    TabularDataIngest ingested = read(xlsxFile);

    // then
    assertThat(ingested.getDataTable().getVarQuantity()).isEqualTo(8);
    assertThat(ingested.getDataTable().getDataVariables())
            .extracting(DataVariable::getName)
            .containsExactly("A", "Col1", "Col2", "D", "Col4", "Col5", "G", "Col7");

    String[] expectedRows = {
        "\"Row1\"\t1.1\t1.2\t1.3\t1.4\t1.5\t1.6\t1.7",
        "\"Row2\"\t2.1\t2.2\t2.3\t2.4\t2.5\t2.6\t2.7",
        "\"Row3\"\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7",
        "\"\"\t4.1\t4.2\t4.3\t4.4\t4.5\t4.6\t4.7",
        "\"Row5\"\t5.1\t5.2\t5.3\t5.4\t6.5\t5.6\t5.7"
    };
    assertThat(Files.readAllLines(ingested.getTabDelimitedFile().toPath()))
            .containsExactly(expectedRows);
}

/**
 * Verifies variable names, detected variable types and the tab-delimited output
 * for sheets mixing text, numbers, formulas and formula errors.
 */
@ParameterizedTest
@CsvSource({ "xslx/value-types-libre.xlsx", "xslx/value-types-excel.xlsx" })
void read__value_types(String xlsxFile) throws Exception {
    // when
    TabularDataIngest ingested = read(xlsxFile);

    // then
    assertThat(ingested.getDataTable().getVarQuantity()).isEqualTo(4);
    assertThat(ingested.getDataTable().getDataVariables())
            .extracting(DataVariable::getName)
            .containsExactly("A", "B", "Total", "Div");
    assertThat(ingested.getDataTable().getDataVariables())
            .extracting(DataVariable::getType)
            .containsExactly(
                    DataVariable.VariableType.CHARACTER,
                    DataVariable.VariableType.NUMERIC,
                    DataVariable.VariableType.CHARACTER,
                    DataVariable.VariableType.CHARACTER);

    String[] expectedRows = {
        "\"1\"\t1.0\t\"1\"\t\"1\"",
        "\"2\"\t4.0\t\"8\"\t\"0.5\"",
        "\"A\"\t4.0\t\"#VALUE!\"\t\"#VALUE!\"",
        "\"1\"\t0.0\t\"0\"\t\"#DIV/0!\""
    };
    assertThat(Files.readAllLines(ingested.getTabDelimitedFile().toPath()))
            .containsExactly(expectedRows);
}

/**
 * Loads the named classpath resource and runs it through {@link XLSXFileReader}.
 *
 * @param xlsxFile classpath-relative path of the spreadsheet to ingest
 * @return the ingest result produced by the reader
 * @throws IOException if opening or reading the file fails
 * @throws RuntimeException wrapping a {@link URISyntaxException} if the resource URL
 *         cannot be converted to a URI
 */
private TabularDataIngest read(String xlsxFile) throws IOException {
    try {
        ClassLoader loader = CSVFileReaderTest.class.getClassLoader();
        File source = Paths.get(loader.getResource(xlsxFile).toURI()).toFile();
        // The stream is closed as soon as the reader has produced its result.
        try (BufferedInputStream input = new BufferedInputStream(Files.newInputStream(source.toPath()))) {
            return new XLSXFileReader(new XLSXFileReaderSpi()).read(Tuple.of(input, source), null);
        }
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading