diff --git a/.gitattributes b/.gitattributes
index 47b4a52e5726e..c19ea2202f725 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -10,5 +10,6 @@
*.bcfks binary
*.crt binary
*.p12 binary
+*.ttf binary
*.txt text=auto
CHANGELOG.md merge=union
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0eeeaab39e52..aa3977b49ace3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,6 +43,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Bump `com.google.auth:google-auth-library-oauth2-http` from 1.37.1 to 1.38.0 ([#19144](https://github.com/opensearch-project/OpenSearch/pull/19144))
- Bump `com.squareup.okio:okio` from 3.15.0 to 3.16.0 ([#19146](https://github.com/opensearch-project/OpenSearch/pull/19146))
- Bump Slf4j from 1.7.36 to 2.0.17 ([#19136](https://github.com/opensearch-project/OpenSearch/pull/19136))
+- Bump `org.apache.tika` from 2.9.2 to 3.2.2 ([#19125](https://github.com/opensearch-project/OpenSearch/pull/19125))
+- Bump `org.apache.commons:commons-compress` from 1.26.1 to 1.28.0 ([#19125](https://github.com/opensearch-project/OpenSearch/pull/19125))
### Deprecated
diff --git a/distribution/tools/plugin-cli/build.gradle b/distribution/tools/plugin-cli/build.gradle
index 8beb17bb8bf9a..41f80eb39a81f 100644
--- a/distribution/tools/plugin-cli/build.gradle
+++ b/distribution/tools/plugin-cli/build.gradle
@@ -81,5 +81,10 @@ thirdPartyAudit.ignoreMissingClasses(
'org.tukaani.xz.XZOutputStream',
'org.apache.commons.codec.digest.PureJavaCrc32C',
'org.apache.commons.codec.digest.XXHash32',
- 'org.apache.commons.lang3.reflect.FieldUtils'
+ 'org.apache.commons.lang3.reflect.FieldUtils',
+ 'org.apache.commons.lang3.ArrayFill',
+ 'org.apache.commons.lang3.ArrayUtils',
+ 'org.apache.commons.lang3.StringUtils',
+ 'org.apache.commons.lang3.SystemProperties',
+ 'org.apache.commons.lang3.function.Suppliers'
)
diff --git a/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1 b/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1
deleted file mode 100644
index 912bda85de18a..0000000000000
--- a/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-44331c1130c370e726a2e1a3e6fba6d2558ef04a
\ No newline at end of file
diff --git a/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1 b/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1
new file mode 100644
index 0000000000000..5edae62aeeb5d
--- /dev/null
+++ b/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1
@@ -0,0 +1 @@
+e482f2c7a88dac3c497e96aa420b6a769f59c8d7
\ No newline at end of file
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 40441dba894bb..7d0e2d31f0baf 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -50,7 +50,7 @@ httpasyncclient = "4.1.5"
commonslogging = "1.2"
commonscodec = "1.18.0"
commonslang = "3.18.0"
-commonscompress = "1.26.1"
+commonscompress = "1.28.0"
commonsio = "2.16.0"
# plugin dependencies
aws = "2.30.31"
diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle
index f6a5f104cac79..0a6306be7daac 100644
--- a/plugins/ingest-attachment/build.gradle
+++ b/plugins/ingest-attachment/build.gradle
@@ -38,8 +38,8 @@ opensearchplugin {
}
versions << [
- 'tika' : '2.9.2',
- 'pdfbox': '2.0.31',
+ 'tika' : '3.2.2',
+ 'pdfbox': '3.0.5',
'poi' : '5.4.1',
'mime4j': '0.8.11'
]
@@ -75,10 +75,11 @@ dependencies {
// external parser libraries
// HTML
- api 'org.ccil.cowan.tagsoup:tagsoup:1.2.1'
+ api 'org.jsoup:jsoup:1.20.1'
// Adobe PDF
api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
+ api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}"
api "org.apache.pdfbox:jempbox:1.8.17"
api "commons-logging:commons-logging:${versions.commonslogging}"
// OpenOffice
@@ -121,6 +122,7 @@ forbiddenPatterns {
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
+ exclude '**/*.ttf'
}
thirdPartyAudit {
diff --git a/plugins/ingest-attachment/licenses/Roboto-OFL.txt b/plugins/ingest-attachment/licenses/Roboto-OFL.txt
new file mode 100644
index 0000000000000..65a3057b1f24b
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/Roboto-OFL.txt
@@ -0,0 +1,93 @@
+Copyright 2011 The Roboto Project Authors (https://github.com/googlefonts/roboto-classic)
+
+This Font Software is licensed under the SIL Open Font License, Version 1.1.
+This license is copied below, and is also available with a FAQ at:
+https://openfontlicense.org
+
+
+-----------------------------------------------------------
+SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
+-----------------------------------------------------------
+
+PREAMBLE
+The goals of the Open Font License (OFL) are to stimulate worldwide
+development of collaborative font projects, to support the font creation
+efforts of academic and linguistic communities, and to provide a free and
+open framework in which fonts may be shared and improved in partnership
+with others.
+
+The OFL allows the licensed fonts to be used, studied, modified and
+redistributed freely as long as they are not sold by themselves. The
+fonts, including any derivative works, can be bundled, embedded,
+redistributed and/or sold with any software provided that any reserved
+names are not used by derivative works. The fonts and derivatives,
+however, cannot be released under any other type of license. The
+requirement for fonts to remain under this license does not apply
+to any document created using the fonts or their derivatives.
+
+DEFINITIONS
+"Font Software" refers to the set of files released by the Copyright
+Holder(s) under this license and clearly marked as such. This may
+include source files, build scripts and documentation.
+
+"Reserved Font Name" refers to any names specified as such after the
+copyright statement(s).
+
+"Original Version" refers to the collection of Font Software components as
+distributed by the Copyright Holder(s).
+
+"Modified Version" refers to any derivative made by adding to, deleting,
+or substituting -- in part or in whole -- any of the components of the
+Original Version, by changing formats or by porting the Font Software to a
+new environment.
+
+"Author" refers to any designer, engineer, programmer, technical
+writer or other person who contributed to the Font Software.
+
+PERMISSION & CONDITIONS
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Font Software, to use, study, copy, merge, embed, modify,
+redistribute, and sell modified and unmodified copies of the Font
+Software, subject to the following conditions:
+
+1) Neither the Font Software nor any of its individual components,
+in Original or Modified Versions, may be sold by itself.
+
+2) Original or Modified Versions of the Font Software may be bundled,
+redistributed and/or sold with any software, provided that each copy
+contains the above copyright notice and this license. These can be
+included either as stand-alone text files, human-readable headers or
+in the appropriate machine-readable metadata fields within text or
+binary files as long as those fields can be easily viewed by the user.
+
+3) No Modified Version of the Font Software may use the Reserved Font
+Name(s) unless explicit written permission is granted by the corresponding
+Copyright Holder. This restriction only applies to the primary font name as
+presented to the users.
+
+4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
+Software shall not be used to promote, endorse or advertise any
+Modified Version, except to acknowledge the contribution(s) of the
+Copyright Holder(s) and the Author(s) or with their explicit written
+permission.
+
+5) The Font Software, modified or unmodified, in part or in whole,
+must be distributed entirely under this license, and must not be
+distributed under any other license. The requirement for fonts to
+remain under this license does not apply to any document created
+using the Font Software.
+
+TERMINATION
+This license becomes null and void if any of the above conditions are
+not met.
+
+DISCLAIMER
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
+DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
+OTHER DEALINGS IN THE FONT SOFTWARE.
diff --git a/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1 b/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1
deleted file mode 100644
index 912bda85de18a..0000000000000
--- a/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-44331c1130c370e726a2e1a3e6fba6d2558ef04a
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1 b/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1
new file mode 100644
index 0000000000000..5edae62aeeb5d
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1
@@ -0,0 +1 @@
+e482f2c7a88dac3c497e96aa420b6a769f59c8d7
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1
deleted file mode 100644
index d45d45a66e072..0000000000000
--- a/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-96999ecdb7324bf718b88724818fa62f81286c36
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1
new file mode 100644
index 0000000000000..241eda72e6dae
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1
@@ -0,0 +1 @@
+b4a068e1dba2b9832a108cdf6e9a3249680e3ce8
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1 b/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1
new file mode 100644
index 0000000000000..9a2329562aae0
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1
@@ -0,0 +1 @@
+769377896610be1736f8d6d51fc52a6042d1ce82
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt b/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt
new file mode 100644
index 0000000000000..e4bf2be9fb7f2
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2009-2025 Jonathan Hedley
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/plugins/ingest-attachment/licenses/tagsoup-NOTICE.txt b/plugins/ingest-attachment/licenses/jsoup-NOTICE.txt
similarity index 100%
rename from plugins/ingest-attachment/licenses/tagsoup-NOTICE.txt
rename to plugins/ingest-attachment/licenses/jsoup-NOTICE.txt
diff --git a/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1
deleted file mode 100644
index fa256ed9a65d2..0000000000000
--- a/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-29b25053099bc30784a766ccb821417e06f4b8a1
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1
new file mode 100644
index 0000000000000..6a6fad5245aa2
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1
@@ -0,0 +1 @@
+c34109061c3a0d85d871d9edc469ac0682f81856
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1
new file mode 100644
index 0000000000000..e70c851dbd9c2
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1
@@ -0,0 +1 @@
+402151a8d1aa427ea879cc7160e9227e9f5088ba
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt b/plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
similarity index 60%
rename from plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt
rename to plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
index 261eeb9e9f8b2..97553f24a432a 100644
--- a/plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt
+++ b/plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt
@@ -1,3 +1,4 @@
+
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -199,3 +200,145 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+EXTERNAL COMPONENTS
+
+Apache PDFBox includes a number of components with separate copyright notices
+and license terms. Your use of these components is subject to the terms and
+conditions of the following licenses.
+
+Contributions made to the original PDFBox and FontBox projects:
+
+ Copyright (c) 2002-2007, www.pdfbox.org
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. Neither the name of pdfbox; nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+Adobe Font Metrics (AFM) for PDF Core 14 Fonts
+
+ This file and the 14 PostScript(R) AFM files it accompanies may be used,
+ copied, and distributed for any purpose and without charge, with or without
+ modification, provided that all copyright notices are retained; that the
+ AFM files are not distributed without this file; that all modifications
+ to this file or any of the AFM files are prominently noted in the modified
+ file(s); and that this paragraph is not modified. Adobe Systems has no
+ responsibility or obligation to support the use of the AFM files.
+
+CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads)
+
+ Copyright 1990-2009 Adobe Systems Incorporated.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ Neither the name of Adobe Systems Incorporated nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ THE POSSIBILITY OF SUCH DAMAGE.
+
+PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf)
+
+ Copyright 2010 Atos Worldline SAS
+
+ Licensed by Atos Worldline SAS under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+OSXAdapter
+
+ Version: 2.0
+
+ Disclaimer: IMPORTANT: This Apple software is supplied to you by
+ Apple Inc. ("Apple") in consideration of your agreement to the
+ following terms, and your use, installation, modification or
+ redistribution of this Apple software constitutes acceptance of these
+ terms. If you do not agree with these terms, please do not use,
+ install, modify or redistribute this Apple software.
+
+ In consideration of your agreement to abide by the following terms, and
+ subject to these terms, Apple grants you a personal, non-exclusive
+ license, under Apple's copyrights in this original Apple software (the
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
+ Software, with or without modifications, in source and/or binary forms;
+ provided that if you redistribute the Apple Software in its entirety and
+ without modifications, you must retain this notice and the following
+ text and disclaimers in all such redistributions of the Apple Software.
+ Neither the name, trademarks, service marks or logos of Apple Inc.
+ may be used to endorse or promote products derived from the Apple
+ Software without specific prior written permission from Apple. Except
+ as expressly stated in this notice, no other rights or licenses, express
+ or implied, are granted by Apple herein, including but not limited to
+ any patent rights that may be infringed by your derivative works or by
+ other works in which the Apple Software may be incorporated.
+
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved
diff --git a/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt b/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt
new file mode 100644
index 0000000000000..3c85708256104
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt
@@ -0,0 +1,22 @@
+Apache PDFBox
+Copyright 2014 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Based on source code originally developed in the PDFBox and
+FontBox projects.
+
+Copyright (c) 2002-2007, www.pdfbox.org
+
+Based on source code originally developed in the PaDaF project.
+Copyright (c) 2010 Atos Worldline SAS
+
+Includes the Adobe Glyph List
+Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated.
+
+Includes the Zapf Dingbats Glyph List
+Copyright 2002, 2010 Adobe Systems Incorporated.
+
+Includes OSXAdapter
+Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved
diff --git a/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1 b/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1
deleted file mode 100644
index 5d227b11a0fa6..0000000000000
--- a/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-5584627487e984c03456266d3f8802eb85a9ce97
diff --git a/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1
deleted file mode 100644
index 80635a63d29fe..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-796a21391780339e3d4862626339b49df170024e
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..01df6be02361e
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1
@@ -0,0 +1 @@
+f1f16ecac7a81e145051f906927ea6b58ce7e914
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1
deleted file mode 100644
index a4bb6d48c6a08..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-7a48a287e464b456a85c79f318d7bad7db201518
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..b692ab8befa3b
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1
@@ -0,0 +1 @@
+3ee2907773fe2aaa1013829e00cd62778d6a2ff9
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1
deleted file mode 100644
index dbaee880d1251..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-758dac27c246c51b019562bab7e266d2da6a6e01
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..7ef86ac18757b
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+fde21727740a39beead899c9ca6e642f92d86e3a
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1
deleted file mode 100644
index b4806746301ef..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-47f6a4c46b92616d14e82cd7ad4d05cb43077b83
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..351a9d6963000
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+e6acd314da558703977a681661c215f3ef92dbbd
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1
deleted file mode 100644
index da1ae42bac652..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-235a20823c02c699ce3d57f3d6b9550db05d91a9
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..bcc475b3f4c1d
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+41ff68abccde91ab17d7b181eb7a5fccf16e8b5c
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1
deleted file mode 100644
index 7ceed9e1643b8..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-7688a4220d07c32b505230479f957cd495c0bef2
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..a7ac03630fe9c
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+d4078f950ca55c5235cdfcad744235242f9edc05
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1
deleted file mode 100644
index e780c1b92d525..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-4d0f0e3f6eff184040402094f4fabbb3c5c7d09f
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..c9baba749d403
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+a972d70ef0762b460c048c5e0e8a46c46bb170aa
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1
deleted file mode 100644
index 6e56fcffc5f88..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-b3a93e538ba6cb4066aba96d629febf181ec9f92
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..c84219d17252b
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+a19be47ecca1a061349dc2d019ab6f2741ff1dee
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1
deleted file mode 100644
index 27062077b92bf..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-ff707716c0c4748ffeb21996aefa8d269b3eab5b
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..e63b0f71f2d19
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1
@@ -0,0 +1 @@
+9dd2f1c52ab2663600e82dae3a8003ce6ede372f
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1
deleted file mode 100644
index 396e2655b14db..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-69104107ff85194df5acf682178128771863e442
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..98b09c1785d78
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1
@@ -0,0 +1 @@
+f1dfa02a2c672153013d44501e0c21d5682aa822
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1
deleted file mode 100644
index bda62033e4e8c..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-2fcea85a56f93a5c0cb81f3d6dd8673f3d81c598
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..ac860449a84dd
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1
@@ -0,0 +1 @@
+d46b71ea5697f575c3febfd7343e5d8b2c338bd5
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1
deleted file mode 100644
index bb76974b6344e..0000000000000
--- a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-c8408deb51fa617ef4e912b4d161712e695d3a29
\ No newline at end of file
diff --git a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1
new file mode 100644
index 0000000000000..f6e9d188908cd
--- /dev/null
+++ b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1
@@ -0,0 +1 @@
+c91fb85f5ee46e2c1f1e3399b04efb9d1ff85485
\ No newline at end of file
diff --git a/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java
index d999d20537485..068f1ae5d6d78 100644
--- a/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java
+++ b/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java
@@ -32,6 +32,16 @@
package org.opensearch.ingest.attachment;
+import org.apache.fontbox.FontBoxFont;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+import org.apache.pdfbox.pdmodel.font.CIDFontMapping;
+import org.apache.pdfbox.pdmodel.font.FontMapper;
+import org.apache.pdfbox.pdmodel.font.FontMappers;
+import org.apache.pdfbox.pdmodel.font.FontMapping;
+import org.apache.pdfbox.pdmodel.font.PDCIDSystemInfo;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -47,6 +57,7 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.UncheckedIOException;
import java.lang.reflect.ReflectPermission;
import java.net.URISyntaxException;
@@ -75,6 +86,44 @@
*/
final class TikaImpl {
+ static {
+ /*
+ * Stop PDFBox from consulting the OS for fonts at all, use classpath instead with dummy fonts because font
+ * does not matter for ingestion
+ */
+ FontMappers.set(new FontMapper() {
+ @Override
+ public FontMapping getTrueTypeFont(String baseFont, PDFontDescriptor fd) {
+ try (InputStream in = TikaImpl.class.getResourceAsStream("/fonts/Roboto-Regular.ttf")) {
+ if (in == null) return new FontMapping<>(null, true);
+ byte[] bytes = in.readAllBytes();
+ TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBuffer(bytes));
+ return new FontMapping<>(ttf, true);
+ } catch (IOException e) {
+ return new FontMapping<>(null, true);
+ }
+ }
+
+ @Override
+ public FontMapping getFontBoxFont(String baseFont, PDFontDescriptor fd) {
+ try (InputStream in = TikaImpl.class.getResourceAsStream("/fonts/Roboto-Regular.ttf")) {
+ if (in == null) return new FontMapping<>(null, true);
+ byte[] bytes = in.readAllBytes();
+ TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBuffer(bytes));
+ return new FontMapping<>(ttf, true);
+ } catch (IOException e) {
+ return new FontMapping<>(null, true);
+ }
+ }
+
+ @Override
+ public CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fd, PDCIDSystemInfo cid) {
+ // No CID substitutions from the OS either; signal "fallback only".
+ return new CIDFontMapping(null, null, true);
+ }
+ });
+ }
+
/** Exclude some formats */
private static final Set EXCLUDES = new HashSet<>(
Arrays.asList(
@@ -91,7 +140,7 @@ final class TikaImpl {
/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
- new org.apache.tika.parser.html.HtmlParser(),
+ new org.apache.tika.parser.html.JSoupParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.rtf.RTFParser(),
diff --git a/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf b/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf
new file mode 100644
index 0000000000000..7e3bb2f8ce7ae
Binary files /dev/null and b/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums
index 227d7d833a231..cbbf7dc49bd8e 100644
--- a/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums
+++ b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums
@@ -3,7 +3,7 @@
"testWORD_1img.docx": "367e2ade13ca3c19bcd8a323e21d51d407e017ac",
"testMasterFooter.odp": "bcc59df70699c739423a50e362c722b81ae76498",
"testTXTNonASCIIUTF8.txt": "1ef514431ca8d838f11e99f8e4a0637730b77aa0",
- "EmbeddedOutlook.docx": "c544a6765c19ba11b0bf3edb55c79e1bd8565c6e",
+ "EmbeddedOutlook.docx": "770c14c1f8d1cb3ff431a6ea7d0cbd9f5091f1f5",
"testWORD_override_list_numbering.docx": "4e892319b921322916225def763f451e4bbb4e16",
"testTextBoxes.key": "b01581d5bd2483ce649a1a1406136359f4b93167",
"testPPT_masterText.pptx": "9fee8337b76dc3e196f4554dcde22b9dd1c3b3e8",
@@ -64,9 +64,9 @@
"testRTFTableCellSeparation2.rtf": "62782ca40ff0ed6c3ba90f8055ee724b44af203f",
"testPagesHeadersFootersRomanLower.pages": "2410fc803907001eb39c201ad4184b243e271c6d",
"headerPic.docx": "c704bb648feac7975dff1024a5f762325be7cbc2",
- "testHTMLNoisyMetaEncoding_4.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
+ "testHTMLNoisyMetaEncoding_4.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744",
"testRTFBoldItalic.rtf": "0475d224078682cf3f9f3f4cbc14a63456c5a0d8",
- "test-outlook.msg": "1f202fc11a873e305d5b4d4607409f3f734065ec",
+ "test-outlook.msg": "ef14d2bbbe167b5d3500dcab3950cfa22cd94665",
"testRTFVarious.rtf": "bf6ea9cf57886e680c5e6743a66a12b950a09083",
"testXHTML.html": "c6da900f81c1c550518e65d579d3dd62dd7c5c0c",
"EmbeddedPDF.docx": "454476bdf4a968189a6f53e75c146382bf58a434",
@@ -101,13 +101,13 @@
"testWORD_override_list_numbering.doc": "60e47a3e71ba08af20af96131d61740a1f0bafa3",
"testPDF_twoAuthors.pdf": "c5f0296cc21f9ae99ceb649b561c55f99d7d9452",
"testPDF_Version.10.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
- "testHTMLNoisyMetaEncoding_2.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
+ "testHTMLNoisyMetaEncoding_2.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744",
"testFooter.odt": "cd5d0fcbcf48d6f005d087c47d00e84f39bcc321",
"testPPT.pptm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
"testPPT_various.ppt": "399e27a9893284f106dc44f15b5e636454db681e",
"testRTFListMicrosoftWord.rtf": "0303eb3e2f30530621a7a407847b759a3b21467e",
"testWORD_bold_character_runs2.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
- "boilerplate-whitespace.html": "a9372bc75d7d84cbcbb0bce68fcaed73ad8ef52c",
+ "boilerplate-whitespace.html": "bf1fd3ffcf798afd688254bbc899e388eda9e546",
"testEXCEL_95.xls": "20d9b9b0f3aecd28607516b4b837c8bab3524b6c",
"testPPT_embedded_two_slides.pptx": "",
"testPDF_bookmarks.pdf": "5fc486c443511452db4f1aa6530714c6aa49c831",
@@ -121,14 +121,14 @@
"testPDF_Version.4.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testBinControlWord.rtf": "ef858fbb7584ea7f92ffed8d0a08c1cc35ffee07",
"testWORD_null_style.docx": "0be9dcfb83423c78a06af514ec21e4e7770ec48e",
- "test-outlook2003.msg": "bb3c35eb7e95d657d7977c1d3d52862734f9f329",
+ "test-outlook2003.msg": "b9c21661a59254c8d6a9b665e28070757a354cbe",
"testPDFVarious.pdf": "c66bbbacb10dd27430f7d0bed9518e75793cedae",
- "testHTMLNoisyMetaEncoding_3.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
+ "testHTMLNoisyMetaEncoding_3.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744",
"testRTFCorruptListOverride.rtf": "116a782d02a7f25010a15cbbb189bf98e6b89855",
"testEXCEL_custom_props.xls": "b5584d9b13ab1566ce539238dc75e7eb3449ba7f",
"testPDF_Version.7.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
"testPDFEmbeddingAndEmbedded.docx": "e7b648adb15cd16cdd84437c2b9524a8eeb213e4",
- "testHTMLNoisyMetaEncoding_1.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
+ "testHTMLNoisyMetaEncoding_1.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744",
"testWORD_3imgs.doc": "818aa8c6c44dd78c49100c3c38e95abdf3812981",
"testRTFEmbeddedLink.rtf": "2720ffb5ff3a6bbb2c5c1cb43fb4922362ed788a",
"testKeynote.key": "11387b59fc6339bb73653fcbb26d387521b98ec9",
@@ -156,7 +156,7 @@
"testWORD_custom_props.doc": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe",
"testPDF_Version.11.x.PDFA-1b.pdf": "71853c6197a6a7f222db0f1978c7cb232b87c5ee",
"testAnnotations.pdf": "5f599e7916198540e1b52c3e472a525f50fd45f6",
- "tika434.html": "7d74122631f52f003a48018cc376026ccd8d984e",
+ "tika434.html": "51cafe6636423e37c05e676cb1454e72961b8f04",
"testPagesHeadersFootersAlphaLower.pages": "fc1d766908134ff4689fa63fa3e91c3e9b08d975",
"testRTFRegularImages.rtf": "756b1db45cb05357ceaf9c8efcf0b76e3913e190",
"testRTFUmlautSpaces2.rtf": "1fcd029357062241d74d789e93477c101ff24e3f",
@@ -166,7 +166,7 @@
"testMasterSlideTable.key": "1d61e2fa3c3f3615500c7f72f62971391b9e9a2f",
"testWORD_various.doc": "8cbdf1a4e0d78471eb90403612c4e92866acf0cb",
"testEXCEL_textbox.xlsx": "1e81121e91e58a74d838e414ae0fc0055a4b4100",
- "big-preamble.html": "a9d759b46b6c6c1857d0d89c3a75ee2f3ace70c9",
+ "big-preamble.html": "edecdb8304a31bca1a71faab2153fa133989e6d8",
"testWORD.docx": "f72140bef19475e950e56084d1ab1cb926697b19",
"testComment.rtf": "f6351d0f1f20c4ee0fff70adca6abbc6e638610e",
"testRTFUnicodeUCNControlWordCharacterDoubling.rtf": "3e6f2f38682e38ffc96a476ca51bec2291a27fa7",
@@ -190,7 +190,7 @@
"testRTFIgnoredControlWord.rtf": "1eb6a2f2fd32b1bb4227c0c02a35cb6027d9ec8c",
"testComment.xls": "4de962f16452159ce302fc4a412b06a06cf9a0f6",
"testPPT.ppsm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
- "boilerplate.html": "b3558f02c3179e4aeeb6057594d87bda79964e7b",
+ "boilerplate.html": "f1e3c82a4f16f67590a5afe4b64d90d98330d216",
"testEXCEL_embeded.xls": "",
"testEXCEL.xlsx": "",
"testPPT_2imgs.ppt": "9a68072ffcf171389e78cf8bc018c4b568a6202d",
diff --git a/plugins/ingestion-kafka/build.gradle b/plugins/ingestion-kafka/build.gradle
index 9ba91190944dc..abd1b1a5c038c 100644
--- a/plugins/ingestion-kafka/build.gradle
+++ b/plugins/ingestion-kafka/build.gradle
@@ -41,6 +41,7 @@ dependencies {
testImplementation "org.testcontainers:kafka:${versions.testcontainers}"
testImplementation "org.rnorth.duct-tape:duct-tape:${versions.ducttape}"
testImplementation "org.apache.commons:commons-compress:${versions.commonscompress}"
+ testImplementation "org.apache.commons:commons-lang3:${versions.commonslang}"
testImplementation "commons-io:commons-io:${versions.commonsio}"
testImplementation 'org.awaitility:awaitility:4.2.0'
}
diff --git a/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1 b/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1
deleted file mode 100644
index 912bda85de18a..0000000000000
--- a/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-44331c1130c370e726a2e1a3e6fba6d2558ef04a
\ No newline at end of file
diff --git a/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1 b/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1
new file mode 100644
index 0000000000000..5edae62aeeb5d
--- /dev/null
+++ b/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1
@@ -0,0 +1 @@
+e482f2c7a88dac3c497e96aa420b6a769f59c8d7
\ No newline at end of file