From fb121569593a10c67baaaa1363b27c83a1db70e0 Mon Sep 17 00:00:00 2001 From: katsuhisa yuasa Date: Mon, 20 Sep 2021 23:30:12 +0900 Subject: [PATCH 1/4] =?UTF-8?q?uchardet.dll=20=E3=81=8C=E5=AD=98=E5=9C=A8?= =?UTF-8?q?=E3=81=97=E3=81=9F=E3=82=89=E3=81=9D=E3=82=8C=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E3=81=A3=E3=81=A6=E6=96=87=E5=AD=97=E3=82=A8=E3=83=B3=E3=82=B3?= =?UTF-8?q?=E3=83=BC=E3=83=87=E3=82=A3=E3=83=B3=E3=82=B0=E3=81=AE=E6=A4=9C?= =?UTF-8?q?=E5=87=BA=E3=81=8C=E8=A1=8C=E3=82=8F=E3=82=8C=E3=82=8B=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E5=87=A6=E7=90=86=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura/sakura.vcxproj | 2 + sakura/sakura.vcxproj.filters | 6 ++ sakura_core/charset/icu4c/CharsetDetector.cpp | 80 +++++++++++++------ sakura_core/charset/icu4c/CharsetDetector.h | 6 +- sakura_core/extmodule/CUchardet.cpp | 58 ++++++++++++++ sakura_core/extmodule/CUchardet.h | 60 ++++++++++++++ 6 files changed, 188 insertions(+), 24 deletions(-) create mode 100644 sakura_core/extmodule/CUchardet.cpp create mode 100644 sakura_core/extmodule/CUchardet.h diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj index 7d606076de..2fdc7ff9f2 100644 --- a/sakura/sakura.vcxproj +++ b/sakura/sakura.vcxproj @@ -415,6 +415,7 @@ + @@ -783,6 +784,7 @@ + diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters index 659a552187..26e57cd9d7 100644 --- a/sakura/sakura.vcxproj.filters +++ b/sakura/sakura.vcxproj.filters @@ -1112,6 +1112,9 @@ Cpp Source Files\recent + + Cpp Source Files\extmodule + @@ -2306,6 +2309,9 @@ Cpp Source Files\convert + + Cpp Source Files\extmodule + diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp index 78c3f7400f..50bb01bad9 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.cpp +++ b/sakura_core/charset/icu4c/CharsetDetector.cpp @@ -28,8 +28,10 @@ CharsetDetector::CharsetDetector() noexcept : _icuin() , _csd(nullptr) + , _ud(nullptr) { _icuin.InitDll(); + _uchardet.InitDll(); } CharsetDetector::~CharsetDetector() noexcept @@ -37,32 +39,13 @@ CharsetDetector::~CharsetDetector() noexcept if (_icuin.IsAvailable()) { _icuin.ucsdet_close(_csd); } + if (_uchardet.IsAvailable()) { + _uchardet.uchardet_delete(_ud); + } } -ECodeType CharsetDetector::Detect(const std::string_view& bytes) +static ECodeType name2code(std::string_view name) { - UErrorCode status = U_ZERO_ERROR; - - _csd = _icuin.ucsdet_open(&status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - const auto csm = _icuin.ucsdet_detect(_csd, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - std::string_view name = _icuin.ucsdet_getName(csm, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - // 文字セット名⇒サクラエディタ内部コードの変換 if (name == "UTF-8") return CODE_UTF8; if (name == "SHIFT_JIS") return CODE_SJIS; @@ -72,6 +55,57 @@ ECodeType CharsetDetector::Detect(const std::string_view& bytes) if (name == "ISO-2022-JP") return CODE_JIS; if (name == "UTF-7") return CODE_UTF7; if (name == "ISO-8859-1") return CODE_LATIN1; + // ここから下は数が多いのでどうしたものか… + // https://www.freedesktop.org/wiki/Software/uchardet/ + // https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers + if (name == "GB2312") return (ECodeType)936; + if (name == "BIG5") return (ECodeType)950; + if (name == "ISO-2022-KR") return (ECodeType)50225; + if (name == "GB18030") return (ECodeType)54936; + return CODE_ERROR; +} + +ECodeType CharsetDetector::Detect(const std::string_view& bytes) +{ + if (_icuin.IsAvailable()) { + UErrorCode status = U_ZERO_ERROR; + _csd = _icuin.ucsdet_open(&status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + const auto csm = _icuin.ucsdet_detect(_csd, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + std::string_view name = _icuin.ucsdet_getName(csm, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + return name2code(name); + } + if (_uchardet.IsAvailable()) { + if (!_ud) { + _ud = _uchardet.uchardet_new(); + } + if (!_ud) { + return CODE_ERROR; + } + int ret = _uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()); + if (ret != 0) { + return CODE_ERROR; + } + _uchardet.uchardet_data_end(_ud); + std::string_view name = _uchardet.uchardet_get_charset(_ud); + auto code = name2code(name); + _uchardet.uchardet_reset(_ud); + return code; + } return CODE_ERROR; } diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h index 23699a2601..1f36ff7d47 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.h +++ b/sakura_core/charset/icu4c/CharsetDetector.h @@ -29,6 +29,7 @@ #include #include "extmodule/CIcu4cI18n.h" +#include "extmodule/CUchardet.h" /*! * @brief 文字コード検出クラス @@ -38,12 +39,15 @@ class CharsetDetector final CIcu4cI18n _icuin; UCharsetDetector* _csd; + CUchardet _uchardet; + uchardet_t _ud; + public: CharsetDetector() noexcept; ~CharsetDetector() noexcept; bool IsAvailable() const noexcept { - return _icuin.IsAvailable(); + return _icuin.IsAvailable() || _uchardet.IsAvailable(); } ECodeType Detect(const std::string_view& bytes); diff --git a/sakura_core/extmodule/CUchardet.cpp b/sakura_core/extmodule/CUchardet.cpp new file mode 100644 index 0000000000..5bcac38612 --- /dev/null +++ b/sakura_core/extmodule/CUchardet.cpp @@ -0,0 +1,58 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CUchardet.h" + +/*! + * @brief DLLの名前を返す + */ +LPCWSTR CUchardet::GetDllNameImp( [[maybe_unused]] int index ) +{ + return L"uchardet.dll"; +} + +/*! + DLLの初期化 + + 関数のアドレスを取得してメンバに保管する. + + @retval true 成功 + @retval false アドレス取得に失敗 +*/ +bool CUchardet::InitDllImp() +{ + // DLL内関数名リスト + const ImportTable table[] = { + { &_uchardet_new, "uchardet_new" }, + { &_uchardet_delete, "uchardet_delete" }, + { &_uchardet_handle_data, "uchardet_handle_data" }, + { &_uchardet_data_end, "uchardet_data_end" }, + { &_uchardet_reset, "uchardet_reset" }, + { &_uchardet_get_charset, "uchardet_get_charset" }, + { NULL, 0 } + }; + return RegisterEntries(table); +} + diff --git a/sakura_core/extmodule/CUchardet.h b/sakura_core/extmodule/CUchardet.h new file mode 100644 index 0000000000..cc3aee5ad7 --- /dev/null +++ b/sakura_core/extmodule/CUchardet.h @@ -0,0 +1,60 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#pragma once + +#include "CDllHandler.h" + +typedef struct uchardet * uchardet_t; + +/*! + * uchardet ライブラリ(uchardet.dll) をラップするクラス + */ +class CUchardet final : public CDllImp +{ +public: + CUchardet() noexcept = default; + + // DLL関数ポインタ + uchardet_t (*_uchardet_new)(void) = nullptr; + void (*_uchardet_delete)(uchardet_t ud) = nullptr; + int (*_uchardet_handle_data)(uchardet_t ud, const char * data, size_t len) = nullptr; + void (*_uchardet_data_end)(uchardet_t ud) = nullptr; + void (*_uchardet_reset)(uchardet_t ud) = nullptr; + const char * (*_uchardet_get_charset)(uchardet_t ud) = nullptr; + +protected: + // CDllImpインタフェース + LPCWSTR GetDllNameImp(int nIndex) override; + bool InitDllImp() override; + +public: + uchardet_t uchardet_new(void) { return _uchardet_new(); } + void uchardet_delete(uchardet_t ud) { _uchardet_delete(ud); } + int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) { return _uchardet_handle_data(ud, data, len); } + void uchardet_data_end(uchardet_t ud) { _uchardet_data_end(ud); } + void uchardet_reset(uchardet_t ud) { _uchardet_reset(ud); } + const char * uchardet_get_charset(uchardet_t ud) { return _uchardet_get_charset(ud); } +}; + From 33b87cd1e6e8b414f149f4537b71d1865d5f54d4 Mon Sep 17 00:00:00 2001 From: katsuhisa yuasa Date: Tue, 21 Sep 2021 00:10:48 +0900 Subject: [PATCH 2/4] delete default constructor of CUchardet class as it can be implicitly defined --- sakura_core/extmodule/CUchardet.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/sakura_core/extmodule/CUchardet.h b/sakura_core/extmodule/CUchardet.h index cc3aee5ad7..7ffb0661d1 100644 --- a/sakura_core/extmodule/CUchardet.h +++ b/sakura_core/extmodule/CUchardet.h @@ -34,8 +34,6 @@ typedef struct uchardet * uchardet_t; class CUchardet final : public CDllImp { public: - CUchardet() noexcept = default; - // DLL関数ポインタ uchardet_t (*_uchardet_new)(void) = nullptr; void (*_uchardet_delete)(uchardet_t ud) = nullptr; From bec37806e93fc7c7171c57a38cd3b405ec089ce8 Mon Sep 17 00:00:00 2001 From: katsuhisa yuasa Date: Tue, 21 Sep 2021 00:57:39 +0900 Subject: [PATCH 3/4] =?UTF-8?q?SonarCloud=20=E3=81=AE=20Code=20Smells=20?= =?UTF-8?q?=E6=95=B0=E3=82=92=E6=B8=9B=E3=82=89=E3=81=99=E7=82=BA=E3=81=AE?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura_core/charset/icu4c/CharsetDetector.cpp | 6 ++---- sakura_core/charset/icu4c/CharsetDetector.h | 2 +- sakura_core/extmodule/CUchardet.cpp | 2 +- sakura_core/extmodule/CUchardet.h | 12 ++++++------ 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp index 50bb01bad9..cc13b813ff 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.cpp +++ b/sakura_core/charset/icu4c/CharsetDetector.cpp @@ -28,7 +28,6 @@ CharsetDetector::CharsetDetector() noexcept : _icuin() , _csd(nullptr) - , _ud(nullptr) { _icuin.InitDll(); _uchardet.InitDll(); @@ -97,14 +96,13 @@ ECodeType CharsetDetector::Detect(const std::string_view& bytes) if (!_ud) { return CODE_ERROR; } - int ret = _uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()); - if (ret != 0) { + _uchardet.uchardet_reset(_ud); + if (_uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()) != 0) { return CODE_ERROR; } _uchardet.uchardet_data_end(_ud); std::string_view name = _uchardet.uchardet_get_charset(_ud); auto code = name2code(name); - _uchardet.uchardet_reset(_ud); return code; } return CODE_ERROR; diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h index 1f36ff7d47..4e8171a162 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.h +++ b/sakura_core/charset/icu4c/CharsetDetector.h @@ -40,7 +40,7 @@ class CharsetDetector final UCharsetDetector* _csd; CUchardet _uchardet; - uchardet_t _ud; + uchardet_t _ud = nullptr; public: CharsetDetector() noexcept; diff --git a/sakura_core/extmodule/CUchardet.cpp b/sakura_core/extmodule/CUchardet.cpp index 5bcac38612..149b8ccb20 100644 --- a/sakura_core/extmodule/CUchardet.cpp +++ b/sakura_core/extmodule/CUchardet.cpp @@ -51,7 +51,7 @@ bool CUchardet::InitDllImp() { &_uchardet_data_end, "uchardet_data_end" }, { &_uchardet_reset, "uchardet_reset" }, { &_uchardet_get_charset, "uchardet_get_charset" }, - { NULL, 0 } + { nullptr, 0 } }; return RegisterEntries(table); } diff --git a/sakura_core/extmodule/CUchardet.h b/sakura_core/extmodule/CUchardet.h index 7ffb0661d1..c73c1e972e 100644 --- a/sakura_core/extmodule/CUchardet.h +++ b/sakura_core/extmodule/CUchardet.h @@ -48,11 +48,11 @@ class CUchardet final : public CDllImp bool InitDllImp() override; public: - uchardet_t uchardet_new(void) { return _uchardet_new(); } - void uchardet_delete(uchardet_t ud) { _uchardet_delete(ud); } - int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) { return _uchardet_handle_data(ud, data, len); } - void uchardet_data_end(uchardet_t ud) { _uchardet_data_end(ud); } - void uchardet_reset(uchardet_t ud) { _uchardet_reset(ud); } - const char * uchardet_get_charset(uchardet_t ud) { return _uchardet_get_charset(ud); } + uchardet_t uchardet_new(void) const { return _uchardet_new(); } + void uchardet_delete(uchardet_t ud) const { _uchardet_delete(ud); } + int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) const { return _uchardet_handle_data(ud, data, len); } + void uchardet_data_end(uchardet_t ud) const { _uchardet_data_end(ud); } + void uchardet_reset(uchardet_t ud) const { _uchardet_reset(ud); } + const char * uchardet_get_charset(uchardet_t ud) const { return _uchardet_get_charset(ud); } }; From 34d4bc8afb52b9857d5a24511396b42eb2532edd Mon Sep 17 00:00:00 2001 From: katsuhisa yuasa Date: Sun, 26 Sep 2021 05:17:16 +0900 Subject: [PATCH 4/4] =?UTF-8?q?CharsetDetector=20=E3=82=92=E8=A6=AA?= =?UTF-8?q?=E3=83=95=E3=82=A9=E3=83=AB=E3=83=80=E3=81=AB=E7=A7=BB=E5=8B=95?= =?UTF-8?q?=20uchardet=E3=83=A9=E3=82=A4=E3=83=96=E3=83=A9=E3=83=AA?= =?UTF-8?q?=E3=82=92=E4=BD=BF=E3=81=86=E5=A0=B4=E5=90=88=E3=81=AF=E6=A4=9C?= =?UTF-8?q?=E5=87=BA=E3=81=97=E3=81=9F=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?= =?UTF-8?q?=E3=83=87=E3=82=A3=E3=83=B3=E3=82=B0=E5=90=8D=E3=81=8B=E3=82=89?= =?UTF-8?q?Windows=20code=20page=E3=81=B8=E3=81=AE=E5=A4=89=E6=8F=9B?= =?UTF-8?q?=E3=81=AF=20std::unordered=5Fmap=20=E3=81=AB=E3=82=88=E3=82=8B?= =?UTF-8?q?=E8=A1=A8=E5=BC=95=E3=81=8D=E3=82=92=E8=A1=8C=E3=81=86=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura/sakura.vcxproj | 4 +- sakura/sakura.vcxproj.filters | 15 +- sakura_core/StdAfx.h | 1 + sakura_core/charset/CCodeMediator.cpp | 2 +- sakura_core/charset/CharsetDetector.cpp | 258 ++++++++++++++++++ .../charset/{icu4c => }/CharsetDetector.h | 0 sakura_core/charset/icu4c/CharsetDetector.cpp | 109 -------- 7 files changed, 268 insertions(+), 121 deletions(-) create mode 100644 sakura_core/charset/CharsetDetector.cpp rename sakura_core/charset/{icu4c => }/CharsetDetector.h (100%) delete mode 100644 sakura_core/charset/icu4c/CharsetDetector.cpp diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj index 2fdc7ff9f2..91b9d93ff9 100644 --- a/sakura/sakura.vcxproj +++ b/sakura/sakura.vcxproj @@ -290,6 +290,7 @@ + @@ -299,7 +300,6 @@ - @@ -639,6 +639,7 @@ + @@ -648,7 +649,6 @@ - diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters index 26e57cd9d7..baa41406b6 100644 --- a/sakura/sakura.vcxproj.filters +++ b/sakura/sakura.vcxproj.filters @@ -119,9 +119,6 @@ {930f3f82-ab3f-49e3-af4a-d4f9c2d51f46} - - {e4629f85-3be8-4dda-80db-1be310929433} - @@ -1085,9 +1082,6 @@ Cpp Source Files\extmodule - - Cpp Source Files\charset\icu4c - Cpp Source Files @@ -1115,6 +1109,9 @@ Cpp Source Files\extmodule + + Cpp Source Files\charset + @@ -2285,9 +2282,6 @@ Cpp Source Files\extmodule - - Cpp Source Files\charset\icu4c - Cpp Source Files @@ -2312,6 +2306,9 @@ Cpp Source Files\extmodule + + Cpp Source Files\charset + diff --git a/sakura_core/StdAfx.h b/sakura_core/StdAfx.h index d08b9d0385..c649a856af 100644 --- a/sakura_core/StdAfx.h +++ b/sakura_core/StdAfx.h @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp index 5b0a3907ea..e5ee7ca234 100644 --- a/sakura_core/charset/CCodeMediator.cpp +++ b/sakura_core/charset/CCodeMediator.cpp @@ -24,7 +24,7 @@ */ #include "StdAfx.h" #include "charset/CCodeMediator.h" -#include "charset/icu4c/CharsetDetector.h" +#include "charset/CharsetDetector.h" #include "charset/CESI.h" #include "io/CBinaryStream.h" #include "mem/CMemory.h" diff --git a/sakura_core/charset/CharsetDetector.cpp b/sakura_core/charset/CharsetDetector.cpp new file mode 100644 index 0000000000..ff6fb197a4 --- /dev/null +++ b/sakura_core/charset/CharsetDetector.cpp @@ -0,0 +1,258 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CharsetDetector.h" + +// https://www.freedesktop.org/wiki/Software/uchardet/ +// https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers +static const std::unordered_map map_charsetToCodePage = { + // International (Unicode) + { "UTF-32BE", 12001, }, + { "UTF-32LE", 12000, }, + // Arabic + { "ISO-8859-6", 28596, }, + { "WINDOWS-1256", 1256, }, + // Bulgarian + { "ISO-8859-5", 28595, }, + { "WINDOWS-1251", 1251, }, + // Chinese + { "ISO-2022-CN", 50227, }, + { "BIG5", 950, }, + { "EUC-TW", 51950, }, + { "GB18030", 54936, }, + { "HZ-GB-2312", 52936, }, + // Croatian + { "ISO-8859-2", 28592, }, + { "ISO-8859-13", 28603, }, + { "ISO-8859-16", 28606, }, + { "WINDOWS-1250", 1250, }, + { "IBM852", 852, }, + { "MAC-CENTRALEUROPE", 10029, }, + // Czech + { "WINDOWS-1250", 1250, }, + { "ISO-8859-2", 28592, }, + { "IBM852", 852, }, + { "MAC-CENTRALEUROPE", 10029, }, + // Danish + { "ISO-8859-1", 28591, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // English + { "ASCII", 20127, }, + // Esperanto + { "ISO-8859-3", 28593, }, + // Estonian + { "ISO-8859-4", 28594, }, + { "ISO-8859-13", 28603, }, + { "ISO-8859-13", 28603, }, + { "WINDOWS-1252", 1252, }, + { "WINDOWS-1257", 1257, }, + // Finnish + { "ISO-8859-1", 28591, }, + { "ISO-8859-4", 28594, }, + { "ISO-8859-9", 28599, }, + { "ISO-8859-13", 28603, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // French + { "ISO-8859-1", 28591, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // German + { "ISO-8859-1", 28591, }, + { "WINDOWS-1252", 1252, }, + // Greek + { "ISO-8859-7", 28597, }, + { "WINDOWS-1253", 1253, }, + // Hebrew + { "ISO-8859-8", 28598, }, + { "WINDOWS-1255", 1255, }, + // Hungarian + { "ISO-8859-2", 28592, }, + { "WINDOWS-1250", 1250, }, + // Irish Gaelic + { "ISO-8859-1", 28591, }, + { "ISO-8859-9", 28599, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // Italian + { "ISO-8859-1", 28591, }, + { "ISO-8859-3", 28593, }, + { "ISO-8859-9", 28599, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // Japanese + { "ISO-2022-JP", 50220, }, + { "SHIFT_JIS", 932, }, + { "EUC-JP", 20932, }, + // Korean + { "ISO-2022-KR", 50225, }, + { "EUC-KR", 51949, }, + // Lithuanian + { "ISO-8859-4", 28594, }, + { "ISO-8859-10", 28600, }, + { "ISO-8859-13", 28603, }, + // Latvian + { "ISO-8859-4", 28594, }, + { "ISO-8859-10", 28600, }, + { "ISO-8859-13", 28603, }, + // Maltese + { "ISO-8859-3", 28593, }, + // Polish + { "ISO-8859-2", 28592, }, + { "ISO-8859-13", 28603, }, + { "ISO-8859-16", 28606, }, + { "WINDOWS-1250", 1250, }, + { "IBM852", 852, }, + { "MAC-CENTRALEUROPE", 10029, }, + // Portuguese + { "ISO-8859-1", 28591, }, + { "ISO-8859-9", 28599, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // Romanian + { "ISO-8859-2", 28592, }, + { "ISO-8859-16", 28606, }, + { "WINDOWS-1250", 1250, }, + { "IBM852", 852, }, + // Russian + { "ISO-8859-5", 28595, }, + { "KOI8-R", 20866, }, + { "WINDOWS-1251", 1251, }, + { "MAC-CYRILLIC", 10007, }, + { "IBM866", 866, }, + { "IBM855", 855, }, + // Slovak + { "WINDOWS-1250", 1250, }, + { "ISO-8859-2", 28592, }, + { "IBM852", 852, }, + { "MAC-CENTRALEUROPE", 10029, }, + // Slovene + { "ISO-8859-2", 28592, }, + { "ISO-8859-16", 28606, }, + { "WINDOWS-1250", 1250, }, + { "IBM852", 852, }, + { "MAC-CENTRALEUROPE", 10029, }, + // Spanish + { "ISO-8859-1", 28591, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // Swedish + { "ISO-8859-1", 28591, }, + { "ISO-8859-4", 28594, }, + { "ISO-8859-9", 28599, }, + { "ISO-8859-15", 28605, }, + { "WINDOWS-1252", 1252, }, + // Thai + { "TIS-620", 874, }, + { "ISO-8859-11", 28601, }, + // Turkish + { "ISO-8859-3", 28593, }, + { "ISO-8859-9", 28599, }, + // Vietnamese + //{ "VISCII", , }, + { "Windows-1258", 1258, }, + // Others + { "WINDOWS-1252", 1252, }, +}; + +CharsetDetector::CharsetDetector() noexcept + : _icuin() + , _csd(nullptr) +{ + _icuin.InitDll(); + _uchardet.InitDll(); +} + +CharsetDetector::~CharsetDetector() noexcept +{ + if (_icuin.IsAvailable()) { + _icuin.ucsdet_close(_csd); + } + if (_uchardet.IsAvailable()) { + _uchardet.uchardet_delete(_ud); + } +} + +ECodeType CharsetDetector::Detect(const std::string_view& bytes) +{ + if (_icuin.IsAvailable()) { + UErrorCode status = U_ZERO_ERROR; + _csd = _icuin.ucsdet_open(&status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + const auto csm = _icuin.ucsdet_detect(_csd, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + std::string_view name = _icuin.ucsdet_getName(csm, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + // 文字セット名⇒サクラエディタ内部コードの変換 + if (name == "UTF-8") return CODE_UTF8; + if (name == "SHIFT_JIS") return CODE_SJIS; + if (name == "UTF-16BE") return CODE_UNICODEBE; + if (name == "UTF-16LE") return CODE_UNICODE; + if (name == "EUC-JP") return CODE_EUC; + if (name == "ISO-2022-JP") return CODE_JIS; + if (name == "UTF-7") return CODE_UTF7; + if (name == "ISO-8859-1") return CODE_LATIN1; + + return CODE_ERROR; + } + if (_uchardet.IsAvailable()) { + if (!_ud) { + _ud = _uchardet.uchardet_new(); + } + if (!_ud) { + return CODE_ERROR; + } + _uchardet.uchardet_reset(_ud); + if (_uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()) != 0) { + return CODE_ERROR; + } + _uchardet.uchardet_data_end(_ud); + std::string_view name = _uchardet.uchardet_get_charset(_ud); + std::string str(name); + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + auto it = map_charsetToCodePage.find(str); + if (it == map_charsetToCodePage.end()) { + return CODE_ERROR; + }else { + return (ECodeType)it->second; + } + } + return CODE_ERROR; +} diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/CharsetDetector.h similarity index 100% rename from sakura_core/charset/icu4c/CharsetDetector.h rename to sakura_core/charset/CharsetDetector.h diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp deleted file mode 100644 index cc13b813ff..0000000000 --- a/sakura_core/charset/icu4c/CharsetDetector.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/*! @file */ -/* - Copyright (C) 2018-2021, Sakura Editor Organization - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; - you must not claim that you wrote the original software. - If you use this software in a product, an acknowledgment - in the product documentation would be appreciated but is - not required. - - 2. Altered source versions must be plainly marked as such, - and must not be misrepresented as being the original software. - - 3. This notice may not be removed or altered from any source - distribution. -*/ -#include "StdAfx.h" -#include "CharsetDetector.h" - -CharsetDetector::CharsetDetector() noexcept - : _icuin() - , _csd(nullptr) -{ - _icuin.InitDll(); - _uchardet.InitDll(); -} - -CharsetDetector::~CharsetDetector() noexcept -{ - if (_icuin.IsAvailable()) { - _icuin.ucsdet_close(_csd); - } - if (_uchardet.IsAvailable()) { - _uchardet.uchardet_delete(_ud); - } -} - -static ECodeType name2code(std::string_view name) -{ - // 文字セット名⇒サクラエディタ内部コードの変換 - if (name == "UTF-8") return CODE_UTF8; - if (name == "SHIFT_JIS") return CODE_SJIS; - if (name == "UTF-16BE") return CODE_UNICODEBE; - if (name == "UTF-16LE") return CODE_UNICODE; - if (name == "EUC-JP") return CODE_EUC; - if (name == "ISO-2022-JP") return CODE_JIS; - if (name == "UTF-7") return CODE_UTF7; - if (name == "ISO-8859-1") return CODE_LATIN1; - // ここから下は数が多いのでどうしたものか… - // https://www.freedesktop.org/wiki/Software/uchardet/ - // https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers - if (name == "GB2312") return (ECodeType)936; - if (name == "BIG5") return (ECodeType)950; - if (name == "ISO-2022-KR") return (ECodeType)50225; - if (name == "GB18030") return (ECodeType)54936; - return CODE_ERROR; -} - -ECodeType CharsetDetector::Detect(const std::string_view& bytes) -{ - if (_icuin.IsAvailable()) { - UErrorCode status = U_ZERO_ERROR; - _csd = _icuin.ucsdet_open(&status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - const auto csm = _icuin.ucsdet_detect(_csd, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - std::string_view name = _icuin.ucsdet_getName(csm, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - return name2code(name); - } - if (_uchardet.IsAvailable()) { - if (!_ud) { - _ud = _uchardet.uchardet_new(); - } - if (!_ud) { - return CODE_ERROR; - } - _uchardet.uchardet_reset(_ud); - if (_uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()) != 0) { - return CODE_ERROR; - } - _uchardet.uchardet_data_end(_ud); - std::string_view name = _uchardet.uchardet_get_charset(_ud); - auto code = name2code(name); - return code; - } - return CODE_ERROR; -}