diff --git a/.gitattributes b/.gitattributes index 47b4a52e5726e..c19ea2202f725 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,5 +10,6 @@ *.bcfks binary *.crt binary *.p12 binary +*.ttf binary *.txt text=auto CHANGELOG.md merge=union diff --git a/CHANGELOG.md b/CHANGELOG.md index d0eeeaab39e52..aa3977b49ace3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Bump `com.google.auth:google-auth-library-oauth2-http` from 1.37.1 to 1.38.0 ([#19144](https://github.com/opensearch-project/OpenSearch/pull/19144)) - Bump `com.squareup.okio:okio` from 3.15.0 to 3.16.0 ([#19146](https://github.com/opensearch-project/OpenSearch/pull/19146)) - Bump Slf4j from 1.7.36 to 2.0.17 ([#19136](https://github.com/opensearch-project/OpenSearch/pull/19136)) +- Bump `org.apache.tika` from 2.9.2 to 3.2.2 ([#19125](https://github.com/opensearch-project/OpenSearch/pull/19125)) +- Bump `org.apache.commons:commons-compress` from 1.26.1 to 1.28.0 ([#19125](https://github.com/opensearch-project/OpenSearch/pull/19125)) ### Deprecated diff --git a/distribution/tools/plugin-cli/build.gradle b/distribution/tools/plugin-cli/build.gradle index 8beb17bb8bf9a..41f80eb39a81f 100644 --- a/distribution/tools/plugin-cli/build.gradle +++ b/distribution/tools/plugin-cli/build.gradle @@ -81,5 +81,10 @@ thirdPartyAudit.ignoreMissingClasses( 'org.tukaani.xz.XZOutputStream', 'org.apache.commons.codec.digest.PureJavaCrc32C', 'org.apache.commons.codec.digest.XXHash32', - 'org.apache.commons.lang3.reflect.FieldUtils' + 'org.apache.commons.lang3.reflect.FieldUtils', + 'org.apache.commons.lang3.ArrayFill', + 'org.apache.commons.lang3.ArrayUtils', + 'org.apache.commons.lang3.StringUtils', + 'org.apache.commons.lang3.SystemProperties', + 'org.apache.commons.lang3.function.Suppliers' ) diff --git a/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1 b/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1 deleted file mode 100644 index 912bda85de18a..0000000000000 --- a/distribution/tools/plugin-cli/licenses/commons-compress-1.26.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -44331c1130c370e726a2e1a3e6fba6d2558ef04a \ No newline at end of file diff --git a/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1 b/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1 new file mode 100644 index 0000000000000..5edae62aeeb5d --- /dev/null +++ b/distribution/tools/plugin-cli/licenses/commons-compress-1.28.0.jar.sha1 @@ -0,0 +1 @@ +e482f2c7a88dac3c497e96aa420b6a769f59c8d7 \ No newline at end of file diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 40441dba894bb..7d0e2d31f0baf 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -50,7 +50,7 @@ httpasyncclient = "4.1.5" commonslogging = "1.2" commonscodec = "1.18.0" commonslang = "3.18.0" -commonscompress = "1.26.1" +commonscompress = "1.28.0" commonsio = "2.16.0" # plugin dependencies aws = "2.30.31" diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index f6a5f104cac79..0a6306be7daac 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -38,8 +38,8 @@ opensearchplugin { } versions << [ - 'tika' : '2.9.2', - 'pdfbox': '2.0.31', + 'tika' : '3.2.2', + 'pdfbox': '3.0.5', 'poi' : '5.4.1', 'mime4j': '0.8.11' ] @@ -75,10 +75,11 @@ dependencies { // external parser libraries // HTML - api 'org.ccil.cowan.tagsoup:tagsoup:1.2.1' + api 'org.jsoup:jsoup:1.20.1' // Adobe PDF api "org.apache.pdfbox:pdfbox:${versions.pdfbox}" api "org.apache.pdfbox:fontbox:${versions.pdfbox}" + api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}" api "org.apache.pdfbox:jempbox:1.8.17" api "commons-logging:commons-logging:${versions.commonslogging}" // OpenOffice @@ -121,6 +122,7 @@ forbiddenPatterns { exclude '**/*.pdf' exclude '**/*.epub' exclude '**/*.vsdx' + exclude '**/*.ttf' } thirdPartyAudit { diff --git a/plugins/ingest-attachment/licenses/Roboto-OFL.txt b/plugins/ingest-attachment/licenses/Roboto-OFL.txt new file mode 100644 index 0000000000000..65a3057b1f24b --- /dev/null +++ b/plugins/ingest-attachment/licenses/Roboto-OFL.txt @@ -0,0 +1,93 @@ +Copyright 2011 The Roboto Project Authors (https://github.com/googlefonts/roboto-classic) + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://openfontlicense.org + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1 b/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1 deleted file mode 100644 index 912bda85de18a..0000000000000 --- a/plugins/ingest-attachment/licenses/commons-compress-1.26.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -44331c1130c370e726a2e1a3e6fba6d2558ef04a \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1 b/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1 new file mode 100644 index 0000000000000..5edae62aeeb5d --- /dev/null +++ b/plugins/ingest-attachment/licenses/commons-compress-1.28.0.jar.sha1 @@ -0,0 +1 @@ +e482f2c7a88dac3c497e96aa420b6a769f59c8d7 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1 deleted file mode 100644 index d45d45a66e072..0000000000000 --- a/plugins/ingest-attachment/licenses/fontbox-2.0.31.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -96999ecdb7324bf718b88724818fa62f81286c36 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1 new file mode 100644 index 0000000000000..241eda72e6dae --- /dev/null +++ b/plugins/ingest-attachment/licenses/fontbox-3.0.5.jar.sha1 @@ -0,0 +1 @@ +b4a068e1dba2b9832a108cdf6e9a3249680e3ce8 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1 b/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1 new file mode 100644 index 0000000000000..9a2329562aae0 --- /dev/null +++ b/plugins/ingest-attachment/licenses/jsoup-1.20.1.jar.sha1 @@ -0,0 +1 @@ +769377896610be1736f8d6d51fc52a6042d1ce82 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt b/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt new file mode 100644 index 0000000000000..e4bf2be9fb7f2 --- /dev/null +++ b/plugins/ingest-attachment/licenses/jsoup-LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2009-2025 Jonathan Hedley + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/plugins/ingest-attachment/licenses/tagsoup-NOTICE.txt b/plugins/ingest-attachment/licenses/jsoup-NOTICE.txt similarity index 100% rename from plugins/ingest-attachment/licenses/tagsoup-NOTICE.txt rename to plugins/ingest-attachment/licenses/jsoup-NOTICE.txt diff --git a/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1 deleted file mode 100644 index fa256ed9a65d2..0000000000000 --- a/plugins/ingest-attachment/licenses/pdfbox-2.0.31.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -29b25053099bc30784a766ccb821417e06f4b8a1 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1 new file mode 100644 index 0000000000000..6a6fad5245aa2 --- /dev/null +++ b/plugins/ingest-attachment/licenses/pdfbox-3.0.5.jar.sha1 @@ -0,0 +1 @@ +c34109061c3a0d85d871d9edc469ac0682f81856 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1 b/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1 new file mode 100644 index 0000000000000..e70c851dbd9c2 --- /dev/null +++ b/plugins/ingest-attachment/licenses/pdfbox-io-3.0.5.jar.sha1 @@ -0,0 +1 @@ +402151a8d1aa427ea879cc7160e9227e9f5088ba \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt b/plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt similarity index 60% rename from plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt rename to plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt index 261eeb9e9f8b2..97553f24a432a 100644 --- a/plugins/ingest-attachment/licenses/tagsoup-LICENSE.txt +++ b/plugins/ingest-attachment/licenses/pdfbox-io-LICENSE.txt @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -199,3 +200,145 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +EXTERNAL COMPONENTS + +Apache PDFBox includes a number of components with separate copyright notices +and license terms. Your use of these components is subject to the terms and +conditions of the following licenses. + +Contributions made to the original PDFBox and FontBox projects: + + Copyright (c) 2002-2007, www.pdfbox.org + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of pdfbox; nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + +Adobe Font Metrics (AFM) for PDF Core 14 Fonts + + This file and the 14 PostScript(R) AFM files it accompanies may be used, + copied, and distributed for any purpose and without charge, with or without + modification, provided that all copyright notices are retained; that the + AFM files are not distributed without this file; that all modifications + to this file or any of the AFM files are prominently noted in the modified + file(s); and that this paragraph is not modified. Adobe Systems has no + responsibility or obligation to support the use of the AFM files. + +CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads) + + Copyright 1990-2009 Adobe Systems Incorporated. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + Neither the name of Adobe Systems Incorporated nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. + +PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf) + + Copyright 2010 Atos Worldline SAS + + Licensed by Atos Worldline SAS under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +OSXAdapter + + Version: 2.0 + + Disclaimer: IMPORTANT: This Apple software is supplied to you by + Apple Inc. ("Apple") in consideration of your agreement to the + following terms, and your use, installation, modification or + redistribution of this Apple software constitutes acceptance of these + terms. If you do not agree with these terms, please do not use, + install, modify or redistribute this Apple software. + + In consideration of your agreement to abide by the following terms, and + subject to these terms, Apple grants you a personal, non-exclusive + license, under Apple's copyrights in this original Apple software (the + "Apple Software"), to use, reproduce, modify and redistribute the Apple + Software, with or without modifications, in source and/or binary forms; + provided that if you redistribute the Apple Software in its entirety and + without modifications, you must retain this notice and the following + text and disclaimers in all such redistributions of the Apple Software. + Neither the name, trademarks, service marks or logos of Apple Inc. + may be used to endorse or promote products derived from the Apple + Software without specific prior written permission from Apple. Except + as expressly stated in this notice, no other rights or licenses, express + or implied, are granted by Apple herein, including but not limited to + any patent rights that may be infringed by your derivative works or by + other works in which the Apple Software may be incorporated. + + The Apple Software is provided by Apple on an "AS IS" basis. APPLE + MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION + THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND + OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. + + IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL + OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, + MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED + AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), + STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved diff --git a/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt b/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt new file mode 100644 index 0000000000000..3c85708256104 --- /dev/null +++ b/plugins/ingest-attachment/licenses/pdfbox-io-NOTICE.txt @@ -0,0 +1,22 @@ +Apache PDFBox +Copyright 2014 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Based on source code originally developed in the PDFBox and +FontBox projects. + +Copyright (c) 2002-2007, www.pdfbox.org + +Based on source code originally developed in the PaDaF project. +Copyright (c) 2010 Atos Worldline SAS + +Includes the Adobe Glyph List +Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated. + +Includes the Zapf Dingbats Glyph List +Copyright 2002, 2010 Adobe Systems Incorporated. + +Includes OSXAdapter +Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved diff --git a/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1 b/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1 deleted file mode 100644 index 5d227b11a0fa6..0000000000000 --- a/plugins/ingest-attachment/licenses/tagsoup-1.2.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -5584627487e984c03456266d3f8802eb85a9ce97 diff --git a/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1 deleted file mode 100644 index 80635a63d29fe..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-core-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -796a21391780339e3d4862626339b49df170024e \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..01df6be02361e --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-core-3.2.2.jar.sha1 @@ -0,0 +1 @@ +f1f16ecac7a81e145051f906927ea6b58ce7e914 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1 deleted file mode 100644 index a4bb6d48c6a08..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -7a48a287e464b456a85c79f318d7bad7db201518 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..b692ab8befa3b --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-langdetect-optimaize-3.2.2.jar.sha1 @@ -0,0 +1 @@ +3ee2907773fe2aaa1013829e00cd62778d6a2ff9 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1 deleted file mode 100644 index dbaee880d1251..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-apple-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -758dac27c246c51b019562bab7e266d2da6a6e01 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..7ef86ac18757b --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-apple-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +fde21727740a39beead899c9ca6e642f92d86e3a \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1 deleted file mode 100644 index b4806746301ef..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-html-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -47f6a4c46b92616d14e82cd7ad4d05cb43077b83 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..351a9d6963000 --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-html-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +e6acd314da558703977a681661c215f3ef92dbbd \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1 deleted file mode 100644 index da1ae42bac652..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -235a20823c02c699ce3d57f3d6b9550db05d91a9 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..bcc475b3f4c1d --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-microsoft-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +41ff68abccde91ab17d7b181eb7a5fccf16e8b5c \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1 deleted file mode 100644 index 7ceed9e1643b8..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -7688a4220d07c32b505230479f957cd495c0bef2 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..a7ac03630fe9c --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-miscoffice-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +d4078f950ca55c5235cdfcad744235242f9edc05 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1 deleted file mode 100644 index e780c1b92d525..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -4d0f0e3f6eff184040402094f4fabbb3c5c7d09f \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..c9baba749d403 --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-pdf-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +a972d70ef0762b460c048c5e0e8a46c46bb170aa \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1 deleted file mode 100644 index 6e56fcffc5f88..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-text-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -b3a93e538ba6cb4066aba96d629febf181ec9f92 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..c84219d17252b --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-text-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +a19be47ecca1a061349dc2d019ab6f2741ff1dee \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1 deleted file mode 100644 index 27062077b92bf..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-xml-module-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -ff707716c0c4748ffeb21996aefa8d269b3eab5b \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..e63b0f71f2d19 --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-xml-module-3.2.2.jar.sha1 @@ -0,0 +1 @@ +9dd2f1c52ab2663600e82dae3a8003ce6ede372f \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1 deleted file mode 100644 index 396e2655b14db..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -69104107ff85194df5acf682178128771863e442 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..98b09c1785d78 --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-xmp-commons-3.2.2.jar.sha1 @@ -0,0 +1 @@ +f1dfa02a2c672153013d44501e0c21d5682aa822 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1 deleted file mode 100644 index bda62033e4e8c..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -2fcea85a56f93a5c0cb81f3d6dd8673f3d81c598 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..ac860449a84dd --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parser-zip-commons-3.2.2.jar.sha1 @@ -0,0 +1 @@ +d46b71ea5697f575c3febfd7343e5d8b2c338bd5 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1 deleted file mode 100644 index bb76974b6344e..0000000000000 --- a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-2.9.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -c8408deb51fa617ef4e912b4d161712e695d3a29 \ No newline at end of file diff --git a/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1 b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1 new file mode 100644 index 0000000000000..f6e9d188908cd --- /dev/null +++ b/plugins/ingest-attachment/licenses/tika-parsers-standard-package-3.2.2.jar.sha1 @@ -0,0 +1 @@ +c91fb85f5ee46e2c1f1e3399b04efb9d1ff85485 \ No newline at end of file diff --git a/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java index d999d20537485..068f1ae5d6d78 100644 --- a/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java +++ b/plugins/ingest-attachment/src/main/java/org/opensearch/ingest/attachment/TikaImpl.java @@ -32,6 +32,16 @@ package org.opensearch.ingest.attachment; +import org.apache.fontbox.FontBoxFont; +import org.apache.fontbox.ttf.TTFParser; +import org.apache.fontbox.ttf.TrueTypeFont; +import org.apache.pdfbox.io.RandomAccessReadBuffer; +import org.apache.pdfbox.pdmodel.font.CIDFontMapping; +import org.apache.pdfbox.pdmodel.font.FontMapper; +import org.apache.pdfbox.pdmodel.font.FontMappers; +import org.apache.pdfbox.pdmodel.font.FontMapping; +import org.apache.pdfbox.pdmodel.font.PDCIDSystemInfo; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -47,6 +57,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.UncheckedIOException; import java.lang.reflect.ReflectPermission; import java.net.URISyntaxException; @@ -75,6 +86,44 @@ */ final class TikaImpl { + static { + /* + * Stop PDFBox from consulting the OS for fonts at all, use classpath instead with dummy fonts because font + * does not matter for ingestion + */ + FontMappers.set(new FontMapper() { + @Override + public FontMapping getTrueTypeFont(String baseFont, PDFontDescriptor fd) { + try (InputStream in = TikaImpl.class.getResourceAsStream("/fonts/Roboto-Regular.ttf")) { + if (in == null) return new FontMapping<>(null, true); + byte[] bytes = in.readAllBytes(); + TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBuffer(bytes)); + return new FontMapping<>(ttf, true); + } catch (IOException e) { + return new FontMapping<>(null, true); + } + } + + @Override + public FontMapping getFontBoxFont(String baseFont, PDFontDescriptor fd) { + try (InputStream in = TikaImpl.class.getResourceAsStream("/fonts/Roboto-Regular.ttf")) { + if (in == null) return new FontMapping<>(null, true); + byte[] bytes = in.readAllBytes(); + TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBuffer(bytes)); + return new FontMapping<>(ttf, true); + } catch (IOException e) { + return new FontMapping<>(null, true); + } + } + + @Override + public CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fd, PDCIDSystemInfo cid) { + // No CID substitutions from the OS either; signal "fallback only". + return new CIDFontMapping(null, null, true); + } + }); + } + /** Exclude some formats */ private static final Set EXCLUDES = new HashSet<>( Arrays.asList( @@ -91,7 +140,7 @@ final class TikaImpl { /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents - new org.apache.tika.parser.html.HtmlParser(), + new org.apache.tika.parser.html.JSoupParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.rtf.RTFParser(), diff --git a/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf b/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf new file mode 100644 index 0000000000000..7e3bb2f8ce7ae Binary files /dev/null and b/plugins/ingest-attachment/src/main/resources/fonts/Roboto-Regular.ttf differ diff --git a/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums index 227d7d833a231..cbbf7dc49bd8e 100644 --- a/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums +++ b/plugins/ingest-attachment/src/test/resources/org/opensearch/ingest/attachment/test/.checksums @@ -3,7 +3,7 @@ "testWORD_1img.docx": "367e2ade13ca3c19bcd8a323e21d51d407e017ac", "testMasterFooter.odp": "bcc59df70699c739423a50e362c722b81ae76498", "testTXTNonASCIIUTF8.txt": "1ef514431ca8d838f11e99f8e4a0637730b77aa0", - "EmbeddedOutlook.docx": "c544a6765c19ba11b0bf3edb55c79e1bd8565c6e", + "EmbeddedOutlook.docx": "770c14c1f8d1cb3ff431a6ea7d0cbd9f5091f1f5", "testWORD_override_list_numbering.docx": "4e892319b921322916225def763f451e4bbb4e16", "testTextBoxes.key": "b01581d5bd2483ce649a1a1406136359f4b93167", "testPPT_masterText.pptx": "9fee8337b76dc3e196f4554dcde22b9dd1c3b3e8", @@ -64,9 +64,9 @@ "testRTFTableCellSeparation2.rtf": "62782ca40ff0ed6c3ba90f8055ee724b44af203f", "testPagesHeadersFootersRomanLower.pages": "2410fc803907001eb39c201ad4184b243e271c6d", "headerPic.docx": "c704bb648feac7975dff1024a5f762325be7cbc2", - "testHTMLNoisyMetaEncoding_4.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testHTMLNoisyMetaEncoding_4.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744", "testRTFBoldItalic.rtf": "0475d224078682cf3f9f3f4cbc14a63456c5a0d8", - "test-outlook.msg": "1f202fc11a873e305d5b4d4607409f3f734065ec", + "test-outlook.msg": "ef14d2bbbe167b5d3500dcab3950cfa22cd94665", "testRTFVarious.rtf": "bf6ea9cf57886e680c5e6743a66a12b950a09083", "testXHTML.html": "c6da900f81c1c550518e65d579d3dd62dd7c5c0c", "EmbeddedPDF.docx": "454476bdf4a968189a6f53e75c146382bf58a434", @@ -101,13 +101,13 @@ "testWORD_override_list_numbering.doc": "60e47a3e71ba08af20af96131d61740a1f0bafa3", "testPDF_twoAuthors.pdf": "c5f0296cc21f9ae99ceb649b561c55f99d7d9452", "testPDF_Version.10.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", - "testHTMLNoisyMetaEncoding_2.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testHTMLNoisyMetaEncoding_2.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744", "testFooter.odt": "cd5d0fcbcf48d6f005d087c47d00e84f39bcc321", "testPPT.pptm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", "testPPT_various.ppt": "399e27a9893284f106dc44f15b5e636454db681e", "testRTFListMicrosoftWord.rtf": "0303eb3e2f30530621a7a407847b759a3b21467e", "testWORD_bold_character_runs2.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa", - "boilerplate-whitespace.html": "a9372bc75d7d84cbcbb0bce68fcaed73ad8ef52c", + "boilerplate-whitespace.html": "bf1fd3ffcf798afd688254bbc899e388eda9e546", "testEXCEL_95.xls": "20d9b9b0f3aecd28607516b4b837c8bab3524b6c", "testPPT_embedded_two_slides.pptx": "", "testPDF_bookmarks.pdf": "5fc486c443511452db4f1aa6530714c6aa49c831", @@ -121,14 +121,14 @@ "testPDF_Version.4.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", "testBinControlWord.rtf": "ef858fbb7584ea7f92ffed8d0a08c1cc35ffee07", "testWORD_null_style.docx": "0be9dcfb83423c78a06af514ec21e4e7770ec48e", - "test-outlook2003.msg": "bb3c35eb7e95d657d7977c1d3d52862734f9f329", + "test-outlook2003.msg": "b9c21661a59254c8d6a9b665e28070757a354cbe", "testPDFVarious.pdf": "c66bbbacb10dd27430f7d0bed9518e75793cedae", - "testHTMLNoisyMetaEncoding_3.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testHTMLNoisyMetaEncoding_3.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744", "testRTFCorruptListOverride.rtf": "116a782d02a7f25010a15cbbb189bf98e6b89855", "testEXCEL_custom_props.xls": "b5584d9b13ab1566ce539238dc75e7eb3449ba7f", "testPDF_Version.7.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2", "testPDFEmbeddingAndEmbedded.docx": "e7b648adb15cd16cdd84437c2b9524a8eeb213e4", - "testHTMLNoisyMetaEncoding_1.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a", + "testHTMLNoisyMetaEncoding_1.html": "83d08bacf04d72f04b9ac67df81e9e63a891d744", "testWORD_3imgs.doc": "818aa8c6c44dd78c49100c3c38e95abdf3812981", "testRTFEmbeddedLink.rtf": "2720ffb5ff3a6bbb2c5c1cb43fb4922362ed788a", "testKeynote.key": "11387b59fc6339bb73653fcbb26d387521b98ec9", @@ -156,7 +156,7 @@ "testWORD_custom_props.doc": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe", "testPDF_Version.11.x.PDFA-1b.pdf": "71853c6197a6a7f222db0f1978c7cb232b87c5ee", "testAnnotations.pdf": "5f599e7916198540e1b52c3e472a525f50fd45f6", - "tika434.html": "7d74122631f52f003a48018cc376026ccd8d984e", + "tika434.html": "51cafe6636423e37c05e676cb1454e72961b8f04", "testPagesHeadersFootersAlphaLower.pages": "fc1d766908134ff4689fa63fa3e91c3e9b08d975", "testRTFRegularImages.rtf": "756b1db45cb05357ceaf9c8efcf0b76e3913e190", "testRTFUmlautSpaces2.rtf": "1fcd029357062241d74d789e93477c101ff24e3f", @@ -166,7 +166,7 @@ "testMasterSlideTable.key": "1d61e2fa3c3f3615500c7f72f62971391b9e9a2f", "testWORD_various.doc": "8cbdf1a4e0d78471eb90403612c4e92866acf0cb", "testEXCEL_textbox.xlsx": "1e81121e91e58a74d838e414ae0fc0055a4b4100", - "big-preamble.html": "a9d759b46b6c6c1857d0d89c3a75ee2f3ace70c9", + "big-preamble.html": "edecdb8304a31bca1a71faab2153fa133989e6d8", "testWORD.docx": "f72140bef19475e950e56084d1ab1cb926697b19", "testComment.rtf": "f6351d0f1f20c4ee0fff70adca6abbc6e638610e", "testRTFUnicodeUCNControlWordCharacterDoubling.rtf": "3e6f2f38682e38ffc96a476ca51bec2291a27fa7", @@ -190,7 +190,7 @@ "testRTFIgnoredControlWord.rtf": "1eb6a2f2fd32b1bb4227c0c02a35cb6027d9ec8c", "testComment.xls": "4de962f16452159ce302fc4a412b06a06cf9a0f6", "testPPT.ppsm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6", - "boilerplate.html": "b3558f02c3179e4aeeb6057594d87bda79964e7b", + "boilerplate.html": "f1e3c82a4f16f67590a5afe4b64d90d98330d216", "testEXCEL_embeded.xls": "", "testEXCEL.xlsx": "", "testPPT_2imgs.ppt": "9a68072ffcf171389e78cf8bc018c4b568a6202d", diff --git a/plugins/ingestion-kafka/build.gradle b/plugins/ingestion-kafka/build.gradle index 9ba91190944dc..abd1b1a5c038c 100644 --- a/plugins/ingestion-kafka/build.gradle +++ b/plugins/ingestion-kafka/build.gradle @@ -41,6 +41,7 @@ dependencies { testImplementation "org.testcontainers:kafka:${versions.testcontainers}" testImplementation "org.rnorth.duct-tape:duct-tape:${versions.ducttape}" testImplementation "org.apache.commons:commons-compress:${versions.commonscompress}" + testImplementation "org.apache.commons:commons-lang3:${versions.commonslang}" testImplementation "commons-io:commons-io:${versions.commonsio}" testImplementation 'org.awaitility:awaitility:4.2.0' } diff --git a/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1 b/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1 deleted file mode 100644 index 912bda85de18a..0000000000000 --- a/plugins/repository-hdfs/licenses/commons-compress-1.26.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -44331c1130c370e726a2e1a3e6fba6d2558ef04a \ No newline at end of file diff --git a/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1 b/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1 new file mode 100644 index 0000000000000..5edae62aeeb5d --- /dev/null +++ b/plugins/repository-hdfs/licenses/commons-compress-1.28.0.jar.sha1 @@ -0,0 +1 @@ +e482f2c7a88dac3c497e96aa420b6a769f59c8d7 \ No newline at end of file