Skip to content

Commit 96b2623

Browse files
author
David Roberts
committed
[ML] Rename the json file structure to ndjson (#34901)
The file structure finder endpoint can find the NDJSON (newline-delimited JSON) file format, but called it `json`. This change renames the `format` for this file structure to `ndjson`, which is more precise and will hopefully avoid confusion.
1 parent 350b889 commit 96b2623

File tree

15 files changed

+62
-62
lines changed

15 files changed

+62
-62
lines changed

docs/reference/ml/apis/find-file-structure.asciidoc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ chosen.
7474
structure finder produced its result. The default value is `false`.
7575

7676
`format`::
77-
(string) The high level structure of the file. Valid values are `json`, `xml`,
77+
(string) The high level structure of the file. Valid values are `ndjson`, `xml`,
7878
`delimited`, and `semi_structured_text`. If this parameter is not specified,
7979
the structure finder chooses one.
8080

@@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
259259
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
260260
"charset" : "UTF-8", <4>
261261
"has_byte_order_marker" : false, <5>
262-
"format" : "json", <6>
262+
"format" : "ndjson", <6>
263263
"need_client_timezone" : false, <7>
264264
"mappings" : { <8>
265265
"author" : {
@@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:
473473

474474
<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
475475
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
476-
For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
476+
For NDJSON, this value is the same as `num_lines_analyzed`. For other file
477477
formats, messages can span several lines.
478478
<3> `sample_start` reproduces the first two messages in the file verbatim. This
479479
may help to diagnose parse errors or accidental uploads of the wrong file.
480480
<4> `charset` indicates the character encoding used to parse the file.
481481
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
482482
file begins with a byte order marker.
483-
<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
483+
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
484484
<7> If a timestamp format is detected that does not include a timezone,
485485
`need_client_timezone` will be `true`. The server that parses the file must
486486
therefore be told the correct timezone by the client.

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {
3232

3333
public enum Format {
3434

35-
JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
35+
NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
3636

3737
public boolean supportsNesting() {
3838
switch (this) {
39-
case JSON:
39+
case NDJSON:
4040
case XML:
4141
return true;
4242
case DELIMITED:
@@ -49,7 +49,7 @@ public boolean supportsNesting() {
4949

5050
public boolean isStructured() {
5151
switch (this) {
52-
case JSON:
52+
case NDJSON:
5353
case XML:
5454
case DELIMITED:
5555
return true;
@@ -62,7 +62,7 @@ public boolean isStructured() {
6262

6363
public boolean isSemiStructured() {
6464
switch (this) {
65-
case JSON:
65+
case NDJSON:
6666
case XML:
6767
case DELIMITED:
6868
return false;
@@ -645,7 +645,7 @@ public FileStructure build() {
645645
}
646646

647647
switch (format) {
648-
case JSON:
648+
case NDJSON:
649649
if (shouldTrimFields != null) {
650650
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
651651
}

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ public void testValidateNonDelimited() {
124124
public void testValidateNonSemiStructuredText() {
125125

126126
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
127-
request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
127+
request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
128128
request.setGrokPattern(randomAlphaOfLength(80));
129129
request.setSample(new BytesArray("foo\n"));
130130

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
3636
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
3737
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
38-
* 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
38+
* 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
3939
* 4. Create an appropriate structure object and delegate writing configs to it
4040
*/
4141
public final class FileStructureFinderManager {
@@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
7373
* These need to be ordered so that the more generic formats come after the more specific ones
7474
*/
7575
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
76-
new JsonFileStructureFinderFactory(),
76+
new NdJsonFileStructureFinderFactory(),
7777
new XmlFileStructureFinderFactory(),
78-
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
78+
// NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
7979
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
8080
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
8181
new DelimitedFileStructureFinderFactory(';', '"', 4, false),
Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,16 @@
2525
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
2626

2727
/**
28-
* Really ND-JSON.
28+
* Newline-delimited JSON.
2929
*/
30-
public class JsonFileStructureFinder implements FileStructureFinder {
30+
public class NdJsonFileStructureFinder implements FileStructureFinder {
3131

3232
private final List<String> sampleMessages;
3333
private final FileStructure structure;
3434

35-
static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
36-
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
37-
TimeoutChecker timeoutChecker) throws IOException {
35+
static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
36+
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
37+
TimeoutChecker timeoutChecker) throws IOException {
3838

3939
List<Map<String, ?>> sampleRecords = new ArrayList<>();
4040

@@ -43,10 +43,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
4343
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
4444
sampleMessage);
4545
sampleRecords.add(parser.mapOrdered());
46-
timeoutChecker.check("JSON parsing");
46+
timeoutChecker.check("NDJSON parsing");
4747
}
4848

49-
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
49+
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
5050
.setCharset(charsetName)
5151
.setHasByteOrderMarker(hasByteOrderMarker)
5252
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
@@ -84,10 +84,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
8484
.setExplanation(explanation)
8585
.build();
8686

87-
return new JsonFileStructureFinder(sampleMessages, structure);
87+
return new NdJsonFileStructureFinder(sampleMessages, structure);
8888
}
8989

90-
private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
90+
private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
9191
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
9292
this.structure = structure;
9393
}
Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@
1717

1818
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
1919

20-
public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
20+
public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {
2121

2222
@Override
2323
public boolean canFindFormat(FileStructure.Format format) {
24-
return format == null || format == FileStructure.Format.JSON;
24+
return format == null || format == FileStructure.Format.NDJSON;
2525
}
2626

2727
/**
28-
* This format matches if the sample consists of one or more JSON documents.
28+
* This format matches if the sample is NDJSON, i.e. consists of one or more JSON documents.
2929
* If there is more than one, they must be newline-delimited. The
3030
* documents must be non-empty, to prevent lines containing "{}" from matching.
3131
*/
@@ -41,35 +41,35 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
4141
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
4242

4343
if (parser.map().isEmpty()) {
44-
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
44+
explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
4545
return false;
4646
}
4747
++completeDocCount;
4848
if (parser.nextToken() != null) {
49-
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
49+
explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
5050
sampleLine + "]");
5151
return false;
5252
}
5353
}
5454
}
5555
} catch (IOException | IllegalStateException e) {
56-
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
56+
explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
5757
return false;
5858
}
5959

6060
if (completeDocCount == 0) {
61-
explanation.add("Not JSON because sample didn't contain a complete document");
61+
explanation.add("Not NDJSON because sample didn't contain a complete document");
6262
return false;
6363
}
6464

65-
explanation.add("Deciding sample is newline delimited JSON");
65+
explanation.add("Deciding sample is newline delimited NDJSON");
6666
return true;
6767
}
6868

6969
@Override
7070
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
7171
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
72-
return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
72+
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
7373
timeoutChecker);
7474
}
7575

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
1212
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
1313
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
1414

15-
// CSV - no need to check JSON or XML because they come earlier in the order we check formats
15+
// CSV - no need to check NDJSON or XML because they come earlier in the order we check formats
1616

1717
public void testCanCreateCsvFromSampleGivenCsv() {
1818

@@ -39,7 +39,7 @@ public void testCanCreateCsvFromSampleGivenText() {
3939
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
4040
}
4141

42-
// TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
42+
// TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
4343

4444
public void testCanCreateTsvFromSampleGivenTsv() {
4545

@@ -61,7 +61,7 @@ public void testCanCreateTsvFromSampleGivenText() {
6161
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
6262
}
6363

64-
// Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
64+
// Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
6565

6666
public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
6767

@@ -78,7 +78,7 @@ public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
7878
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
7979
}
8080

81-
// Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
81+
// Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
8282
// values because they come earlier in the order we check formats
8383

8484
public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,20 +73,20 @@ public void testFindCharsetGivenBinary() throws Exception {
7373
}
7474
}
7575

76-
public void testMakeBestStructureGivenJson() throws Exception {
77-
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
78-
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
76+
public void testMakeBestStructureGivenNdJson() throws Exception {
77+
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
78+
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
7979
}
8080

81-
public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
81+
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {
8282

8383
// Need to change the quote character from the default of double quotes
84-
// otherwise the quotes in the JSON will stop it parsing as CSV
84+
// otherwise the quotes in the NDJSON will stop it parsing as CSV
8585
FileStructureOverrides overrides = FileStructureOverrides.builder()
8686
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
8787

88-
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
89-
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
88+
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
89+
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
9090
}
9191

9292
public void testMakeBestStructureGivenXml() throws Exception {
@@ -109,13 +109,13 @@ public void testMakeBestStructureGivenCsv() throws Exception {
109109

110110
public void testMakeBestStructureGivenCsvAndJsonOverride() {
111111

112-
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
112+
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();
113113

114114
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
115115
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
116116
overrides, NOOP_TIMEOUT_CHECKER));
117117

118-
assertEquals("Input did not match the specified format [json]", e.getMessage());
118+
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
119119
}
120120

121121
public void testMakeBestStructureGivenText() throws Exception {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
2727
"2018-05-17T16:23:40,key1,42.0\n" +
2828
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";
2929

30-
protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
30+
protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
3131
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
3232
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
3333
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,21 @@
99

1010
import java.util.Collections;
1111

12-
public class JsonFileStructureFinderTests extends FileStructureTestCase {
12+
public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
1313

14-
private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
14+
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
1515

1616
public void testCreateConfigsGivenGoodJson() throws Exception {
17-
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
17+
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
1818

1919
String charset = randomFrom(POSSIBLE_CHARSETS);
2020
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
21-
FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
21+
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
2222
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
2323

2424
FileStructure structure = structureFinder.getStructure();
2525

26-
assertEquals(FileStructure.Format.JSON, structure.getFormat());
26+
assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
2727
assertEquals(charset, structure.getCharset());
2828
if (hasByteOrderMarker == null) {
2929
assertNull(structure.getHasByteOrderMarker());

0 commit comments

Comments
 (0)