Open Chinese Convert 1.3.0.dirty
A project for conversion between Traditional and Simplified Chinese
Loading...
Searching...
No Matches
UTF8Util.hpp
1/*
2 * Open Chinese Convert
3 *
4 * Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#pragma once
20
21#ifdef _MSC_VER
22#ifndef NOMINMAX
23#define NOMINMAX
24#endif
25#include <Windows.h>
26#endif // _MSC_VER
27
28#include <cstring>
29
30#include "Common.hpp"
31#include "Exception.hpp"
32
33namespace opencc {
38class OPENCC_EXPORT UTF8Util {
39public:
43 static void SkipUtf8Bom(FILE* fp);
44
/**
 * Returns the length in bytes of the next UTF8 character, judged solely from
 * its lead byte. No continuation bytes are inspected.
 *
 * @param str Pointer to the lead byte; exactly one byte is read.
 * @return 1 to 6 for a recognised lead byte (the legacy 5- and 6-byte forms
 *         are accepted), or 0 when the byte cannot start a character
 *         (a continuation byte 10xxxxxx, or 0xFE / 0xFF).
 */
static size_t NextCharLengthNoException(const char* str) {
  // Work on an unsigned value so the shifts below are well defined whatever
  // the signedness of plain char is on the target platform.
  const unsigned char lead = static_cast<unsigned char>(*str);
  if ((lead >> 4) == 0x0E) { // 1110xxxx
    return 3;
  }
  if ((lead >> 7) == 0x00) { // 0xxxxxxx (this includes the NUL terminator)
    return 1;
  }
  if ((lead >> 5) == 0x06) { // 110xxxxx
    return 2;
  }
  if ((lead >> 3) == 0x1E) { // 11110xxx
    return 4;
  }
  if ((lead >> 2) == 0x3E) { // 111110xx
    return 5;
  }
  if ((lead >> 1) == 0x7E) { // 1111110x
    return 6;
  }
  return 0;
}
66
70 static size_t NextCharLength(const char* str) {
71 size_t length = NextCharLengthNoException(str);
72 if (length == 0) {
73 throw InvalidUTF8(str);
74 }
75 return length;
76 }
77
81 static size_t PrevCharLength(const char* str) {
82 {
83 const size_t length = NextCharLengthNoException(str - 1);
84 if (length == 1) {
85 return length;
86 }
87 }
88 {
89 const size_t length = NextCharLengthNoException(str - 2);
90 if (length == 2) {
91 return length;
92 }
93 }
94 {
95 const size_t length = NextCharLengthNoException(str - 3);
96 if (length == 3) {
97 return length;
98 }
99 }
100 for (size_t i = 4; i <= 6; i++) {
101 const size_t length = NextCharLengthNoException(str - i);
102 if (length == i) {
103 return length;
104 }
105 }
106 throw InvalidUTF8(str);
107 }
108
112 static const char* NextChar(const char* str) {
113 return str + NextCharLength(str);
114 }
115
119 static const char* PrevChar(const char* str) {
120 return str - PrevCharLength(str);
121 }
122
129 static size_t Length(const char* str) {
130 size_t length = 0;
131 while (*str != '\0') {
132 const size_t charLen = NextCharLengthNoException(str);
133 if (charLen == 0) {
134 throw InvalidUTF8(str);
135 }
136 // Verify all continuation bytes are present before the null terminator.
137 // Use a while loop (not a for-with-return) to avoid complex control flow
138 // that triggers MSVC LTCG code-generator bugs.
139 size_t i = 1;
140 while (i < charLen && str[i] != '\0') {
141 ++i;
142 }
143 if (i < charLen) {
144 throw InvalidUTF8(str); // Truncated sequence: throw, don't silently skip
145 }
146 str += charLen;
147 ++length;
148 }
149 return length;
150 }
151
158 static const char* FindNextInline(const char* str, const char ch) {
159 while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
160 str = NextChar(str);
161 }
162 return str;
163 }
164
/**
 * Returns true if the character is a line ending or end of file, i.e. one of
 * '\0', '\n' or '\r'.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
  case '\0':
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
171
/**
 * Copies a substring with the given length to a new string.
 *
 * The copy is performed with strncpy, so if str contains a '\0' before
 * length bytes have been copied, the remainder of the result is filled with
 * '\0'. The returned string always has size() == length.
 *
 * @param str    Source bytes.
 * @param length Number of bytes in the result.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string copy(length, '\0');
  std::strncpy(copy.data(), str, length);
  return copy;
}
181
/**
 * Returns true if the given null-terminated string is at least byteLength
 * bytes long.
 *
 * At most byteLength bytes are examined, and scanning stops at the first
 * '\0'.
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
196
201 static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
202 std::string wordTrunc;
203 if (NotShorterThan(str, maxByteLength)) {
204 size_t len = 0;
205 const char* pStr = str;
206 for (;;) {
207 const size_t charLength = NextCharLength(pStr);
208 if (len + charLength > maxByteLength) {
209 break;
210 }
211 pStr += charLength;
212 len += charLength;
213 }
214 wordTrunc = FromSubstr(str, len);
215 } else {
216 wordTrunc = str;
217 }
218 return wordTrunc;
219 }
220
/**
 * Replaces all occurrences of a pattern in a std::string in place.
 *
 * Matches are searched left to right. After each replacement the search
 * resumes right after the inserted text, so text introduced by `to` is never
 * rescanned.
 *
 * @param str  String to modify.
 * @param from Null-terminated pattern to search for. An empty pattern is
 *             treated as "nothing to replace" and leaves str untouched;
 *             without this guard, find() would match at every position and
 *             the loop would never terminate.
 * @param to   Null-terminated replacement text; may be empty.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = std::strlen(from);
  if (fromLen == 0) {
    return;
  }
  const std::string::size_type toLen = std::strlen(to);
  std::string::size_type pos = 0;
  // Pass the cached lengths so find/replace do not re-measure the C strings.
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;
  }
}
233
/**
 * Joins a std::string vector into a std::string with a separator.
 *
 * @param strings   Pieces to concatenate, in order.
 * @param separator Text written between consecutive pieces (never before the
 *                  first or after the last).
 * @return The concatenation; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::ostringstream joined;
  auto piece = strings.begin();
  if (piece != strings.end()) {
    joined << *piece;
    for (++piece; piece != strings.end(); ++piece) {
      joined << separator << *piece;
    }
  }
  return joined.str();
}
250
/**
 * Joins a std::string vector into a std::string, with nothing between the
 * pieces.
 *
 * @param strings Pieces to concatenate, in order.
 * @return The concatenation; empty when strings is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::string joined;
  for (auto piece = strings.begin(); piece != strings.end(); ++piece) {
    joined.append(*piece);
  }
  return joined;
}
261
262 static void GetByteMap(const char* str, const size_t utf8Length,
263 std::vector<size_t>* byteMap) {
264 if (byteMap->size() < utf8Length) {
265 byteMap->resize(utf8Length);
266 }
267 const char* pstr = str;
268 for (size_t i = 0; i < utf8Length; i++) {
269 (*byteMap)[i] = pstr - str;
270 pstr = NextChar(pstr);
271 }
272 }
273
/**
 * Returns the string in the form selected at compile time: the UTF8 input
 * itself by default, or the result of U8ToU16 (a std::wstring) when
 * _MSC_VER is defined.
 */
#ifndef _MSC_VER
static std::string GetPlatformString(const std::string& str) {
  return str;
}
#else
static std::wstring GetPlatformString(const std::string& str) {
  return U8ToU16(str);
}
#endif // _MSC_VER
281
282#ifdef _MSC_VER
283 static std::string U16ToU8(const std::wstring& wstr) {
284 std::string ret;
285 int length = static_cast<int>(wstr.length());
286 int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0,
287 NULL, NULL);
288 if (convcnt > 0) {
289 ret.resize(convcnt);
290 WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt,
291 NULL, NULL);
292 }
293 return ret;
294 }
295
296 static std::wstring U8ToU16(const std::string& str) {
297 std::wstring ret;
298 int length = static_cast<int>(str.length());
299 int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
300 if (convcnt > 0) {
301 ret.resize(convcnt);
302 MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt);
303 }
304 return ret;
305 }
306#endif // _MSC_VER
307};
308} // namespace opencc
Definition Exception.hpp:77
UTF8 std::string utilities.
Definition UTF8Util.hpp:38
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition UTF8Util.hpp:168
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF8 character.
Definition UTF8Util.hpp:81
static std::string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new string.
Definition UTF8Util.hpp:175
static void ReplaceAll(std::string &str, const char *from, const char *to)
Replaces all patterns in a std::string in place.
Definition UTF8Util.hpp:224
static void SkipUtf8Bom(FILE *fp)
Detect UTF8 BOM and skip it.
Definition UTF8Util.cpp:23
static size_t NextCharLengthNoException(const char *str)
Returns the length in bytes of the next UTF8 character, or 0 if the lead byte is invalid.
Definition UTF8Util.hpp:49
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given string is at least as long, in bytes, as the given length.
Definition UTF8Util.hpp:186
static std::string Join(const std::vector< std::string > &strings)
Joins a std::string vector into a std::string.
Definition UTF8Util.hpp:254
static std::string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a std::string with a maximal length in byte.
Definition UTF8Util.hpp:201
static size_t Length(const char *str)
Returns the UTF8 length of a null-terminated string.
Definition UTF8Util.hpp:129
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition UTF8Util.hpp:158
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF8 character; throws InvalidUTF8 if the lead byte is invalid.
Definition UTF8Util.hpp:70
static std::string Join(const std::vector< std::string > &strings, const std::string &separator)
Joins a std::string vector into a std::string with a separator.
Definition UTF8Util.hpp:237
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition UTF8Util.hpp:119
static const char * NextChar(const char *str)
Returns a char* pointer to just past the next UTF8 character.
Definition UTF8Util.hpp:112