unchecked::replace_invalid()

Add unchecked version of replace_invalid function.
5 years ago · c523193025
3 changed files with 98 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1195,6 +1195,43 @@ assert (utf32result.size() == 2);

 This is a faster but less safe version of `utf8::utf8to32`. It does not check for validity of the supplied UTF-8 sequence.

+#### utf8::unchecked::replace_invalid
+
+Available in version 3.1 and later.
+
+Replaces all invalid UTF-8 sequences within a string with a replacement marker.
+
+```cpp
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+```
+
+`octet_iterator`: an input iterator.  
+`output_iterator`: an output iterator.  
+`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences.  
+`end`: an iterator pointing to past-the-end of the UTF-8 string to look for invalid UTF-8 sequences.  
+`out`: An output iterator to the range where the result of replacement is stored.  
+`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd`.  
+Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences.
+
+Example of use:
+
+```cpp
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+assert (bvalid);
+const char* fixed_invalid_sequence = "a????z";
+assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+```
+
+`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range.
+
+Unlike `utf8::replace_invalid`, this function does not verify validity of the replacement marker.
+
 ### Types From utf8::unchecked Namespace

 #### utf8::iterator
@ -1215,12 +1252,19 @@ class iterator;
 `explicit iterator (const octet_iterator& octet_it);` a constructor that initializes the underlying octet_iterator with `octet_it`.

 `octet_iterator base () const;` returns the underlying octet_iterator.
+
 `uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point.
+
 `bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal.
+
 `bool operator != (const iterator& rhs) const;` returns `true` if the two underlying iterators are not equal.
+
 `iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point.
+
 `iterator operator ++ (int);` the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one.
+
 `iterator& operator -- ();` the prefix decrement - moves the iterator to the previous UTF-8 encoded code point.
+
 `iterator operator -- (int);` the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one.

 Example of use:
--- a/source/utf8/unchecked.h
+++ b/source/utf8/unchecked.h
@ -32,7 +32,7 @@ DEALINGS IN THE SOFTWARE.

 namespace utf8
 {
-    namespace unchecked 
+    namespace unchecked
    {
        template <typename octet_iterator>
        octet_iterator append(uint32_t cp, octet_iterator result)
@ -57,6 +57,46 @@ namespace utf8
            return result;
        }

+        // Copies the octets in [start, end) to out, emitting the encoded
+        // replacement code point wherever validate_next reports an error.
+        // The replacement value is handed to unchecked::append as given.
+        // Returns the output iterator positioned after the last octet written.
+        template <typename octet_iterator, typename output_iterator>
+        output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+        {
+            while (start != end) {
+                octet_iterator first_octet = start;
+                const internal::utf_error status = utf8::internal::validate_next(start, end);
+
+                if (status == internal::UTF8_OK) {
+                    // Pass the octets between the remembered position and the
+                    // updated start through unchanged.
+                    octet_iterator src = first_octet;
+                    while (src != start) {
+                        *out++ = *src;
+                        ++src;
+                    }
+                }
+                else if (status == internal::NOT_ENOUGH_ROOM) {
+                    // One marker for the remainder of the input, then stop.
+                    out = utf8::unchecked::append (replacement, out);
+                    start = end;
+                }
+                else if (status == internal::INVALID_LEAD) {
+                    // Replace the single offending octet and move on.
+                    out = utf8::unchecked::append (replacement, out);
+                    ++start;
+                }
+                else if (status == internal::INCOMPLETE_SEQUENCE ||
+                         status == internal::OVERLONG_SEQUENCE ||
+                         status == internal::INVALID_CODE_POINT) {
+                    out = utf8::unchecked::append (replacement, out);
+                    ++start;
+                    // Skip the trail octets that follow so the whole sequence
+                    // yields only one replacement marker.
+                    while (start != end && utf8::internal::is_trail(*start))
+                        ++start;
+                }
+            }
+            return out;
+        }
+
+        // Overload without an explicit marker: forwards to the four-argument
+        // replace_invalid with the code point 0xfffd as the replacement.
+        template <typename octet_iterator, typename output_iterator>
+        inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+        {
+            static const uint32_t default_marker = utf8::internal::mask16(0xfffd);
+            return utf8::unchecked::replace_invalid(start, end, out, default_marker);
+        }
+
        template <typename octet_iterator>
        uint32_t next(octet_iterator& it)
        {
--- a/tests/test_unchecked_api.cpp
+++ b/tests/test_unchecked_api.cpp
@ -146,3 +146,16 @@ TEST(UnCheckedAPITests, test_utf8to16)
    EXPECT_EQ (utf16result[2], 0xd834);
    EXPECT_EQ (utf16result[3], 0xdd1e);
 }
+
+// Feeds utf8::unchecked::replace_invalid a buffer mixing valid and invalid
+// octets with '?' as the marker, then checks that the output is valid UTF-8
+// and equals "a????z".
+TEST(UnCheckedAPITests, test_replace_invalid)
+{
+    // sizeof() counts the terminating NUL of both arrays, so the NUL is part
+    // of the input range and of the expected output.
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    // Fully qualified so the unchecked overload is the one under test; the
+    // char* arguments do not bring utf8::unchecked into lookup on their own.
+    utf8::unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    bool bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    EXPECT_TRUE (bvalid);
+    const char fixed_invalid_sequence[] = "a????z";
+    // Fatal on mismatch: the comparison below must not walk past the end of
+    // either range.
+    ASSERT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size());
+    EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+}
+