|
| 1 | +// Copyright 2016 Wladimir J. van der Laan |
| 2 | +// Distributed under the MIT software license, see the accompanying |
| 3 | +// file COPYING or http://www.opensource.org/licenses/mit-license.php. |
| 4 | +#ifndef UNIVALUE_UTFFILTER_H |
| 5 | +#define UNIVALUE_UTFFILTER_H |
| 6 | + |
| 7 | +#include <string> |
| 8 | + |
/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Input arrives either as raw bytes (push_back), which are decoded as UTF-8,
 * or as whole codepoints (push_back_u), e.g. from JSON "\uXXXX" escapes.
 * Accepted characters are appended, UTF-8 encoded, to the std::string that
 * was passed to the constructor. Any violation clears an internal validity
 * flag; the flag is sticky and is reported by finalize().
 *
 * The filter keeps a reference to the output string, so that string must
 * outlive the filter.
 */
class JSONUTF8StringFilter
{
public:
    // Bind the filter to the string `s` that receives the filtered output.
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), min_codepoint(0), state(0), surpair(0)
    {
    }

    // Write single 8-bit char (may be part of UTF-8 sequence)
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, fast direct pass-through
                // A first surrogate half must be followed immediately by its
                // second half; an ASCII byte in between leaves it dangling.
                if (surpair)
                    is_valid = false;
                str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) { // Mid-sequence character, invalid in this state
                is_valid = false;
            } else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = static_cast<unsigned int>(ch & 0x1f) << 6;
                min_codepoint = 0x80;
                state = 6;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = static_cast<unsigned int>(ch & 0x0f) << 12;
                min_codepoint = 0x800;
                state = 12;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = static_cast<unsigned int>(ch & 0x07) << 18;
                min_codepoint = 0x10000;
                state = 18;
            } else { // Reserved, invalid
                is_valid = false;
            }
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            state -= 6;
            codepoint |= static_cast<unsigned int>(ch & 0x3f) << state;
            if (state == 0) {
                // Reject overlong forms: the decoded value must actually
                // need as many bytes as the sequence used.
                if (codepoint < min_codepoint)
                    is_valid = false;
                // Decoded codepoints take the same path as escapes, so
                // UTF-8 encoded surrogate halves are collated there as well.
                push_back_u(codepoint);
            }
        }
    }

    // Write codepoint directly, possibly collating surrogate pairs
    void push_back_u(unsigned int codepoint_)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = codepoint_;
        } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. The 0x10000
                // offset has to be added, not OR-ed: the shifted high half
                // occupies bits 10..19 and therefore overlaps bit 16.
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (codepoint_ - 0xDC00));
                surpair = 0;
            } else { // Second half doesn't follow a first half - fail
                is_valid = false;
            }
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else if (codepoint_ > 0x10FFFF) // Beyond the Unicode code space - fail
                is_valid = false;
            else
                append_codepoint(codepoint_);
        }
    }

    // Check that we're in a state where the string can be ended
    // No open sequences, no open surrogate pairs, etc
    // Returns true only if everything written so far was valid.
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }

private:
    std::string &str; // Output string, owned by the caller
    bool is_valid;    // Cleared on the first violation, never set again

    // Current UTF-8 decoding state
    unsigned int codepoint;     // Codepoint accumulated from the bytes seen so far
    unsigned int min_codepoint; // Smallest value allowed for the current sequence length
    int state; // Top bit to be filled in for next UTF-8 byte, or 0

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    // Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append `codepoint_` to the output as a 1 to 4 byte UTF-8 sequence.
    // Values above 0x10FFFF are not encodable and append nothing;
    // push_back_u rejects them before calling this.
    void append_codepoint(unsigned int codepoint_)
    {
        if (codepoint_ <= 0x7f) {
            str.push_back(static_cast<char>(codepoint_));
        } else if (codepoint_ <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        }
    }
};
| 118 | + |
| 119 | +#endif |
0 commit comments