diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj index 7d606076de..91b9d93ff9 100644 --- a/sakura/sakura.vcxproj +++ b/sakura/sakura.vcxproj @@ -290,6 +290,7 @@ + @@ -299,7 +300,6 @@ - @@ -415,6 +415,7 @@ + @@ -638,6 +639,7 @@ + @@ -647,7 +649,6 @@ - @@ -783,6 +784,7 @@ + diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters index 659a552187..baa41406b6 100644 --- a/sakura/sakura.vcxproj.filters +++ b/sakura/sakura.vcxproj.filters @@ -119,9 +119,6 @@ {930f3f82-ab3f-49e3-af4a-d4f9c2d51f46} - - {e4629f85-3be8-4dda-80db-1be310929433} - @@ -1085,9 +1082,6 @@ Cpp Source Files\extmodule - - Cpp Source Files\charset\icu4c - Cpp Source Files @@ -1112,6 +1106,12 @@ Cpp Source Files\recent + + Cpp Source Files\extmodule + + + Cpp Source Files\charset + @@ -2282,9 +2282,6 @@ Cpp Source Files\extmodule - - Cpp Source Files\charset\icu4c - Cpp Source Files @@ -2306,6 +2303,12 @@ Cpp Source Files\convert + + Cpp Source Files\extmodule + + + Cpp Source Files\charset + diff --git a/sakura_core/StdAfx.h b/sakura_core/StdAfx.h index d08b9d0385..c649a856af 100644 --- a/sakura_core/StdAfx.h +++ b/sakura_core/StdAfx.h @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp index 5b0a3907ea..e5ee7ca234 100644 --- a/sakura_core/charset/CCodeMediator.cpp +++ b/sakura_core/charset/CCodeMediator.cpp @@ -24,7 +24,7 @@ */ #include "StdAfx.h" #include "charset/CCodeMediator.h" -#include "charset/icu4c/CharsetDetector.h" +#include "charset/CharsetDetector.h" #include "charset/CESI.h" #include "io/CBinaryStream.h" #include "mem/CMemory.h" diff --git a/sakura_core/charset/CharsetDetector.cpp b/sakura_core/charset/CharsetDetector.cpp new file mode 100644 index 0000000000..ff6fb197a4 --- /dev/null +++ b/sakura_core/charset/CharsetDetector.cpp @@ -0,0 +1,258 @@ +/*! 
/*!
 * @brief Charset name -> Windows code page identifier.
 *
 * Keys are the charset names listed by uchardet, values are the matching
 * Windows code page identifiers:
 *   https://www.freedesktop.org/wiki/Software/uchardet/
 *   https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
 *
 * Every key MUST be spelled in upper case: CharsetDetector::Detect()
 * upper-cases the detected name before calling find(), so a mixed-case key
 * (the former "Windows-1258") can never be matched.
 *
 * Each charset appears exactly once. The earlier per-language listing repeated
 * the same key many times; std::unordered_map keeps only the first occurrence,
 * so the repeats had no effect.
 */
static const std::unordered_map<std::string, int> map_charsetToCodePage = {
	// Unicode
	{ "UTF-32BE", 12001 },
	{ "UTF-32LE", 12000 },
	// 7-bit
	{ "ASCII", 20127 },
	// ISO-8859 family
	{ "ISO-8859-1", 28591 },
	{ "ISO-8859-2", 28592 },
	{ "ISO-8859-3", 28593 },
	{ "ISO-8859-4", 28594 },
	{ "ISO-8859-5", 28595 },
	{ "ISO-8859-6", 28596 },
	{ "ISO-8859-7", 28597 },
	{ "ISO-8859-8", 28598 },
	{ "ISO-8859-9", 28599 },
	{ "ISO-8859-10", 28600 },
	{ "ISO-8859-11", 28601 },
	{ "ISO-8859-13", 28603 },
	{ "ISO-8859-15", 28605 },
	{ "ISO-8859-16", 28606 },
	// Windows code pages
	{ "WINDOWS-1250", 1250 },
	{ "WINDOWS-1251", 1251 },
	{ "WINDOWS-1252", 1252 },
	{ "WINDOWS-1253", 1253 },
	{ "WINDOWS-1255", 1255 },
	{ "WINDOWS-1256", 1256 },
	{ "WINDOWS-1257", 1257 },
	{ "WINDOWS-1258", 1258 },
	// IBM / DOS code pages
	{ "IBM852", 852 },
	{ "IBM855", 855 },
	{ "IBM866", 866 },
	// Macintosh
	{ "MAC-CENTRALEUROPE", 10029 },
	{ "MAC-CYRILLIC", 10007 },
	// Cyrillic (KOI8)
	{ "KOI8-R", 20866 },
	// Chinese
	{ "ISO-2022-CN", 50227 },
	{ "BIG5", 950 },
	{ "EUC-TW", 51950 },
	{ "GB18030", 54936 },
	{ "HZ-GB-2312", 52936 },
	// Japanese
	{ "ISO-2022-JP", 50220 },
	{ "SHIFT_JIS", 932 },
	{ "EUC-JP", 20932 },
	// Korean
	{ "ISO-2022-KR", 50225 },
	{ "EUC-KR", 51949 },
	// Thai
	{ "TIS-620", 874 },
	// Vietnamese: "VISCII" is deliberately absent (no code page was assigned to it here).
};
CODE_JIS; + if (name == "UTF-7") return CODE_UTF7; + if (name == "ISO-8859-1") return CODE_LATIN1; + + return CODE_ERROR; + } + if (_uchardet.IsAvailable()) { + if (!_ud) { + _ud = _uchardet.uchardet_new(); + } + if (!_ud) { + return CODE_ERROR; + } + _uchardet.uchardet_reset(_ud); + if (_uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()) != 0) { + return CODE_ERROR; + } + _uchardet.uchardet_data_end(_ud); + std::string_view name = _uchardet.uchardet_get_charset(_ud); + std::string str(name); + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + auto it = map_charsetToCodePage.find(str); + if (it == map_charsetToCodePage.end()) { + return CODE_ERROR; + }else { + return (ECodeType)it->second; + } + } + return CODE_ERROR; +} diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/CharsetDetector.h similarity index 91% rename from sakura_core/charset/icu4c/CharsetDetector.h rename to sakura_core/charset/CharsetDetector.h index 23699a2601..4e8171a162 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.h +++ b/sakura_core/charset/CharsetDetector.h @@ -29,6 +29,7 @@ #include #include "extmodule/CIcu4cI18n.h" +#include "extmodule/CUchardet.h" /*! * @brief 文字コード検出クラス @@ -38,12 +39,15 @@ class CharsetDetector final CIcu4cI18n _icuin; UCharsetDetector* _csd; + CUchardet _uchardet; + uchardet_t _ud = nullptr; + public: CharsetDetector() noexcept; ~CharsetDetector() noexcept; bool IsAvailable() const noexcept { - return _icuin.IsAvailable(); + return _icuin.IsAvailable() || _uchardet.IsAvailable(); } ECodeType Detect(const std::string_view& bytes); diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp deleted file mode 100644 index 78c3f7400f..0000000000 --- a/sakura_core/charset/icu4c/CharsetDetector.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/*! 
@file */ -/* - Copyright (C) 2018-2021, Sakura Editor Organization - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; - you must not claim that you wrote the original software. - If you use this software in a product, an acknowledgment - in the product documentation would be appreciated but is - not required. - - 2. Altered source versions must be plainly marked as such, - and must not be misrepresented as being the original software. - - 3. This notice may not be removed or altered from any source - distribution. -*/ -#include "StdAfx.h" -#include "CharsetDetector.h" - -CharsetDetector::CharsetDetector() noexcept - : _icuin() - , _csd(nullptr) -{ - _icuin.InitDll(); -} - -CharsetDetector::~CharsetDetector() noexcept -{ - if (_icuin.IsAvailable()) { - _icuin.ucsdet_close(_csd); - } -} - -ECodeType CharsetDetector::Detect(const std::string_view& bytes) -{ - UErrorCode status = U_ZERO_ERROR; - - _csd = _icuin.ucsdet_open(&status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - const auto csm = _icuin.ucsdet_detect(_csd, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - std::string_view name = _icuin.ucsdet_getName(csm, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - // 文字セット名⇒サクラエディタ内部コードの変換 - if (name == "UTF-8") return CODE_UTF8; - if (name == "SHIFT_JIS") return CODE_SJIS; - if (name == "UTF-16BE") return CODE_UNICODEBE; - if (name == "UTF-16LE") return CODE_UNICODE; - if (name == "EUC-JP") return 
CODE_EUC; - if (name == "ISO-2022-JP") return CODE_JIS; - if (name == "UTF-7") return CODE_UTF7; - if (name == "ISO-8859-1") return CODE_LATIN1; - - return CODE_ERROR; -} diff --git a/sakura_core/extmodule/CUchardet.cpp b/sakura_core/extmodule/CUchardet.cpp new file mode 100644 index 0000000000..149b8ccb20 --- /dev/null +++ b/sakura_core/extmodule/CUchardet.cpp @@ -0,0 +1,58 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CUchardet.h" + +/*! + * @brief DLLの名前を返す + */ +LPCWSTR CUchardet::GetDllNameImp( [[maybe_unused]] int index ) +{ + return L"uchardet.dll"; +} + +/*! + DLLの初期化 + + 関数のアドレスを取得してメンバに保管する. 
+ + @retval true 成功 + @retval false アドレス取得に失敗 +*/ +bool CUchardet::InitDllImp() +{ + // DLL内関数名リスト + const ImportTable table[] = { + { &_uchardet_new, "uchardet_new" }, + { &_uchardet_delete, "uchardet_delete" }, + { &_uchardet_handle_data, "uchardet_handle_data" }, + { &_uchardet_data_end, "uchardet_data_end" }, + { &_uchardet_reset, "uchardet_reset" }, + { &_uchardet_get_charset, "uchardet_get_charset" }, + { nullptr, 0 } + }; + return RegisterEntries(table); +} + diff --git a/sakura_core/extmodule/CUchardet.h b/sakura_core/extmodule/CUchardet.h new file mode 100644 index 0000000000..c73c1e972e --- /dev/null +++ b/sakura_core/extmodule/CUchardet.h @@ -0,0 +1,58 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#pragma once + +#include "CDllHandler.h" + +typedef struct uchardet * uchardet_t; + +/*! 
+ * uchardet ライブラリ(uchardet.dll) をラップするクラス + */ +class CUchardet final : public CDllImp +{ +public: + // DLL関数ポインタ + uchardet_t (*_uchardet_new)(void) = nullptr; + void (*_uchardet_delete)(uchardet_t ud) = nullptr; + int (*_uchardet_handle_data)(uchardet_t ud, const char * data, size_t len) = nullptr; + void (*_uchardet_data_end)(uchardet_t ud) = nullptr; + void (*_uchardet_reset)(uchardet_t ud) = nullptr; + const char * (*_uchardet_get_charset)(uchardet_t ud) = nullptr; + +protected: + // CDllImpインタフェース + LPCWSTR GetDllNameImp(int nIndex) override; + bool InitDllImp() override; + +public: + uchardet_t uchardet_new(void) const { return _uchardet_new(); } + void uchardet_delete(uchardet_t ud) const { _uchardet_delete(ud); } + int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) const { return _uchardet_handle_data(ud, data, len); } + void uchardet_data_end(uchardet_t ud) const { _uchardet_data_end(ud); } + void uchardet_reset(uchardet_t ud) const { _uchardet_reset(ud); } + const char * uchardet_get_charset(uchardet_t ud) const { return _uchardet_get_charset(ud); } +}; +