From bbc2f4d267ab4c3079d19ce7093465e98c63aad4 Mon Sep 17 00:00:00 2001 From: katsuhisa yuasa Date: Mon, 20 Sep 2021 23:30:12 +0900 Subject: [PATCH] =?UTF-8?q?uchardet.dll=20=E3=81=8C=E5=AD=98=E5=9C=A8?= =?UTF-8?q?=E3=81=97=E3=81=9F=E3=82=89=E3=81=9D=E3=82=8C=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E3=81=A3=E3=81=A6=E6=96=87=E5=AD=97=E3=82=A8=E3=83=B3=E3=82=B3?= =?UTF-8?q?=E3=83=BC=E3=83=87=E3=82=A3=E3=83=B3=E3=82=B0=E3=81=AE=E6=A4=9C?= =?UTF-8?q?=E5=87=BA=E3=81=8C=E8=A1=8C=E3=82=8F=E3=82=8C=E3=82=8B=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E5=87=A6=E7=90=86=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura/sakura.vcxproj | 2 + sakura/sakura.vcxproj.filters | 6 ++ sakura_core/charset/icu4c/CharsetDetector.cpp | 80 +++++++++++++------ sakura_core/charset/icu4c/CharsetDetector.h | 6 +- sakura_core/extmodule/CUchardet.cpp | 58 ++++++++++++++ sakura_core/extmodule/CUchardet.h | 60 ++++++++++++++ 6 files changed, 188 insertions(+), 24 deletions(-) create mode 100644 sakura_core/extmodule/CUchardet.cpp create mode 100644 sakura_core/extmodule/CUchardet.h diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj index 7d606076de..2fdc7ff9f2 100644 --- a/sakura/sakura.vcxproj +++ b/sakura/sakura.vcxproj @@ -415,6 +415,7 @@ + @@ -783,6 +784,7 @@ + diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters index 659a552187..26e57cd9d7 100644 --- a/sakura/sakura.vcxproj.filters +++ b/sakura/sakura.vcxproj.filters @@ -1112,6 +1112,9 @@ Cpp Source Files\recent + + Cpp Source Files\extmodule + @@ -2306,6 +2309,9 @@ Cpp Source Files\convert + + Cpp Source Files\extmodule + diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp index 78c3f7400f..50bb01bad9 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.cpp +++ b/sakura_core/charset/icu4c/CharsetDetector.cpp @@ -28,8 +28,10 @@ CharsetDetector::CharsetDetector() noexcept : _icuin() , _csd(nullptr) + , _ud(nullptr) { _icuin.InitDll(); + _uchardet.InitDll(); } CharsetDetector::~CharsetDetector() noexcept @@ -37,32 +39,13 @@ CharsetDetector::~CharsetDetector() noexcept if (_icuin.IsAvailable()) { _icuin.ucsdet_close(_csd); } + if (_uchardet.IsAvailable()) { + _uchardet.uchardet_delete(_ud); + } } -ECodeType CharsetDetector::Detect(const std::string_view& bytes) +static ECodeType name2code(std::string_view name) { - UErrorCode status = U_ZERO_ERROR; - - _csd = _icuin.ucsdet_open(&status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - const auto csm = _icuin.ucsdet_detect(_csd, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - - std::string_view name = _icuin.ucsdet_getName(csm, &status); - if (status != U_ZERO_ERROR) { - return CODE_ERROR; - } - // 文字セット名⇒サクラエディタ内部コードの変換 if (name == "UTF-8") return CODE_UTF8; if (name == "SHIFT_JIS") return CODE_SJIS; @@ -72,6 +55,57 @@ ECodeType CharsetDetector::Detect(const std::string_view& bytes) if (name == "ISO-2022-JP") return CODE_JIS; if (name == "UTF-7") return CODE_UTF7; if (name == "ISO-8859-1") return CODE_LATIN1; + // ここから下は数が多いのでどうしたものか… + // https://www.freedesktop.org/wiki/Software/uchardet/ + // https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers + if (name == "GB2312") return (ECodeType)936; + if (name == "BIG5") return (ECodeType)950; + if (name == "ISO-2022-KR") return (ECodeType)50225; + if (name == "GB18030") return (ECodeType)54936; + return CODE_ERROR; +} + +ECodeType CharsetDetector::Detect(const std::string_view& bytes) +{ + if (_icuin.IsAvailable()) { + UErrorCode status = U_ZERO_ERROR; + _csd = _icuin.ucsdet_open(&status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + const auto csm = _icuin.ucsdet_detect(_csd, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + std::string_view name = _icuin.ucsdet_getName(csm, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + return name2code(name); + } + if (_uchardet.IsAvailable()) { + if (!_ud) { + _ud = _uchardet.uchardet_new(); + } + if (!_ud) { + return CODE_ERROR; + } + int ret = _uchardet.uchardet_handle_data(_ud, bytes.data(), bytes.length()); + if (ret != 0) { + return CODE_ERROR; + } + _uchardet.uchardet_data_end(_ud); + std::string_view name = _uchardet.uchardet_get_charset(_ud); + auto code = name2code(name); + _uchardet.uchardet_reset(_ud); + return code; + } return CODE_ERROR; } diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h index 23699a2601..1f36ff7d47 100644 --- a/sakura_core/charset/icu4c/CharsetDetector.h +++ b/sakura_core/charset/icu4c/CharsetDetector.h @@ -29,6 +29,7 @@ #include #include "extmodule/CIcu4cI18n.h" +#include "extmodule/CUchardet.h" /*! * @brief 文字コード検出クラス @@ -38,12 +39,15 @@ class CharsetDetector final CIcu4cI18n _icuin; UCharsetDetector* _csd; + CUchardet _uchardet; + uchardet_t _ud; + public: CharsetDetector() noexcept; ~CharsetDetector() noexcept; bool IsAvailable() const noexcept { - return _icuin.IsAvailable(); + return _icuin.IsAvailable() || _uchardet.IsAvailable(); } ECodeType Detect(const std::string_view& bytes); diff --git a/sakura_core/extmodule/CUchardet.cpp b/sakura_core/extmodule/CUchardet.cpp new file mode 100644 index 0000000000..5bcac38612 --- /dev/null +++ b/sakura_core/extmodule/CUchardet.cpp @@ -0,0 +1,58 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CUchardet.h" + +/*! + * @brief DLLの名前を返す + */ +LPCWSTR CUchardet::GetDllNameImp( [[maybe_unused]] int index ) +{ + return L"uchardet.dll"; +} + +/*! + DLLの初期化 + + 関数のアドレスを取得してメンバに保管する. + + @retval true 成功 + @retval false アドレス取得に失敗 +*/ +bool CUchardet::InitDllImp() +{ + // DLL内関数名リスト + const ImportTable table[] = { + { &_uchardet_new, "uchardet_new" }, + { &_uchardet_delete, "uchardet_delete" }, + { &_uchardet_handle_data, "uchardet_handle_data" }, + { &_uchardet_data_end, "uchardet_data_end" }, + { &_uchardet_reset, "uchardet_reset" }, + { &_uchardet_get_charset, "uchardet_get_charset" }, + { NULL, 0 } + }; + return RegisterEntries(table); +} + diff --git a/sakura_core/extmodule/CUchardet.h b/sakura_core/extmodule/CUchardet.h new file mode 100644 index 0000000000..cc3aee5ad7 --- /dev/null +++ b/sakura_core/extmodule/CUchardet.h @@ -0,0 +1,60 @@ +/*! @file */ +/* + Copyright (C) 2018-2021, Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#pragma once + +#include "CDllHandler.h" + +typedef struct uchardet * uchardet_t; + +/*! + * uchardet ライブラリ(uchardet.dll) をラップするクラス + */ +class CUchardet final : public CDllImp +{ +public: + CUchardet() noexcept = default; + + // DLL関数ポインタ + uchardet_t (*_uchardet_new)(void) = nullptr; + void (*_uchardet_delete)(uchardet_t ud) = nullptr; + int (*_uchardet_handle_data)(uchardet_t ud, const char * data, size_t len) = nullptr; + void (*_uchardet_data_end)(uchardet_t ud) = nullptr; + void (*_uchardet_reset)(uchardet_t ud) = nullptr; + const char * (*_uchardet_get_charset)(uchardet_t ud) = nullptr; + +protected: + // CDllImpインタフェース + LPCWSTR GetDllNameImp(int nIndex) override; + bool InitDllImp() override; + +public: + uchardet_t uchardet_new(void) { return _uchardet_new(); } + void uchardet_delete(uchardet_t ud) { _uchardet_delete(ud); } + int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) { return _uchardet_handle_data(ud, data, len); } + void uchardet_data_end(uchardet_t ud) { _uchardet_data_end(ud); } + void uchardet_reset(uchardet_t ud) { _uchardet_reset(ud); } + const char * uchardet_get_charset(uchardet_t ud) { return _uchardet_get_charset(ud); } +}; +