diff --git a/README.md b/README.md index 3c63afc..4ec9a1b 100644 --- a/README.md +++ b/README.md @@ -1195,6 +1195,43 @@ assert (utf32result.size() == 2); This is a faster but less safe version of `utf8::utf8to32`. It does not check for validity of the supplied UTF-8 sequence. +#### utf8::unchecked::replace_invalid + +Available in version 3.1 and later. + +Replaces all invalid UTF-8 sequences within a string with a replacement marker. + +```cpp +template +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +template +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +``` + +`octet_iterator`: an input iterator. +`output_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. +`out`: An output iterator to the range where the result of replacement is stored. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` +Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. + +Example of use: + +```cpp +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector replace_invalid_result; +unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); +``` + +`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. + +Unlike `utf8::replace_invalid`, this function does not verify validity of the replacement marker. + ### Types From utf8::unchecked Namespace #### utf8::iterator @@ -1215,12 +1252,19 @@ class iterator; `explicit iterator (const octet_iterator& octet_it);` a constructor that initializes the underlying octet_iterator with `octet_it`. `octet_iterator base () const;` returns the underlying octet_iterator. + `uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. + `bool operator == (const iterator& rhs) const;` returns `true` if the two underlaying iterators are equal. + `bool operator != (const iterator& rhs) const;` returns `true` if the two underlaying iterators are not equal. + `iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point. + `iterator operator ++ (int);` the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one. + `iterator& operator -- ();` the prefix decrement - moves the iterator to the previous UTF-8 encoded code point. + `iterator operator -- (int);` the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one. Example of use: diff --git a/source/utf8/unchecked.h b/source/utf8/unchecked.h index c78419f..def0009 100644 --- a/source/utf8/unchecked.h +++ b/source/utf8/unchecked.h @@ -32,7 +32,7 @@ DEALINGS IN THE SOFTWARE. namespace utf8 { - namespace unchecked + namespace unchecked { template octet_iterator append(uint32_t cp, octet_iterator result) @@ -57,6 +57,46 @@ namespace utf8 return result; } + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + template uint32_t next(octet_iterator& it) { diff --git a/tests/test_unchecked_api.cpp b/tests/test_unchecked_api.cpp index fae6cb6..e9f19ca 100644 --- a/tests/test_unchecked_api.cpp +++ b/tests/test_unchecked_api.cpp @@ -146,3 +146,16 @@ TEST(UnCheckedAPITests, test_utf8to16) EXPECT_EQ (utf16result[2], 0xd834); EXPECT_EQ (utf16result[3], 0xdd1e); } + +TEST(UnCheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} +