diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..85e9430 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "extern/gtest"] + path = extern/gtest + url = git@github.com:google/googletest.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f560e2d..33ac45d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,8 @@ option(UTF8_SAMPLES "Enable building samples for UTF8-CPP" On) add_library(utf8cpp INTERFACE) target_include_directories(utf8cpp INTERFACE - "$" - $ + "$" + $ ) add_library(utf8::cpp ALIAS utf8cpp) @@ -23,22 +23,12 @@ install(TARGETS utf8cpp EXPORT utf8cppConfig) install(EXPORT utf8cppConfig DESTINATION ${DEF_INSTALL_CMAKE_DIR}) if(UTF8_SAMPLES) - add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp) - - target_link_libraries(docsample PRIVATE utf8::cpp) + add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp) + target_link_libraries(docsample PRIVATE utf8::cpp) endif() if(UTF8_TESTS) - add_executable(smoke ${PROJECT_SOURCE_DIR}/test_drivers/smoke_test/test.cpp) - add_executable(cpp11 ${PROJECT_SOURCE_DIR}/test_drivers/smoke_test/cpp11.cpp) - add_executable(negative ${PROJECT_SOURCE_DIR}/test_drivers/negative/negative.cpp) - - target_link_libraries(smoke PRIVATE utf8::cpp) - target_link_libraries(cpp11 PRIVATE utf8::cpp) - target_link_libraries(negative PRIVATE utf8::cpp) - - enable_testing() - add_test(smoke_test smoke) - add_test(cpp11_test cpp11) - add_test(negative_test negative ${PROJECT_SOURCE_DIR}/test_data/negative/utf8_invalid.txt) + enable_testing() + add_subdirectory(extern/gtest) + add_subdirectory(tests) endif() diff --git a/extern/gtest b/extern/gtest new file mode 160000 index 0000000..2fe3bd9 --- /dev/null +++ b/extern/gtest @@ -0,0 +1 @@ +Subproject commit 2fe3bd994b3189899d93f1d5a881e725e046fdc2 diff --git a/source/utf8/cpp11.h b/source/utf8/cpp11.h index 64b3f29..c8a8803 100644 --- a/source/utf8/cpp11.h +++ b/source/utf8/cpp11.h @@ -30,7 +30,6 @@ DEALINGS IN THE SOFTWARE. #include "checked.h" #include -#include namespace utf8 { diff --git a/test_drivers/smoke_test/cpp11.cpp b/test_drivers/smoke_test/cpp11.cpp deleted file mode 100644 index 645820f..0000000 --- a/test_drivers/smoke_test/cpp11.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "../../source/utf8.h" -using namespace utf8; -using namespace std; - -int main() -{ - string u; - #if __cplusplus >= 201103L // C++ 11 or later - //append - - append(0x0448, u); - assert (u[0] == char(0xd1) && u[1] == char(0x88) && u.length() == 2); - - u.clear(); - append(0x65e5, u); - assert (u[0] == char(0xe6) && u[1] == char(0x97) && u[2] == char(0xa5) && u.length() == 3); - - u.clear(); - append(0x3044, u); - assert (u[0] == char(0xe3) && u[1] == char(0x81) && u[2] == char(0x84) && u.length() == 3); - - u.clear(); - append(0x10346, u); - assert (u[0] == char(0xf0) && u[1] == char(0x90) && u[2] == char(0x8d) && u[3] == char(0x86) && u.length() == 4); - - //utf16to8 - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u.clear(); - u = utf16to8(utf16string); - assert (u.size() == 10); - - //utf8to16 - string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - u16string utf16result = utf8to16(utf8_with_surrogates); - assert (utf16result.length() == 4); - assert (utf16result[2] == 0xd834); - assert (utf16result[3] == 0xdd1e); - - // utf32to8 - u32string utf32string = {0x448, 0x65E5, 0x10346}; - string utf8result = utf32to8(utf32string); - assert (utf8result.size() == 9); - - // utf8to32 - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - u32string utf32result = utf8to32(twochars); - assert (utf32result.size() == 2); - - //find_invalid - string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - auto invalid = find_invalid(utf_invalid); - assert (invalid == 5); - - //is_valid - bool bvalid = is_valid(utf_invalid); - assert (bvalid == false); - bvalid = is_valid(utf8_with_surrogates); - assert (bvalid == true); - - //replace_invalid - string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - string replace_invalid_result = replace_invalid(invalid_sequence, '?'); - bvalid = is_valid(replace_invalid_result); - assert (bvalid); - const string fixed_invalid_sequence = "a????z"; - assert (fixed_invalid_sequence == replace_invalid_result); - - //starts_with_bom - string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - bool bbom = starts_with_bom(byte_order_mark); - assert (bbom == true); - string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars); - assert (no_bbom == false); - - -#endif // C++ 11 or later -} diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp deleted file mode 100644 index 42f4cc8..0000000 --- a/test_drivers/smoke_test/test.cpp +++ /dev/null @@ -1,273 +0,0 @@ -#include -#include -#include -#include "../../source/utf8.h" -using namespace utf8; -using namespace std; - -int main() -{ - //append - unsigned char u[5] = {0,0,0,0,0}; - - append(0x0448, u); - assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); - - append(0x65e5, u); - assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); - - append(0x3044, u); - assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0); - - append(0x10346, u); - assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); - - - //next - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - const char* w = twochars; - int cp = next(w, twochars + 6); - assert (cp == 0x65e5); - assert (w == twochars + 3); - - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - w = threechars; - cp = next(w, threechars + 9); - assert (cp == 0x10346); - assert (w == threechars + 4); - cp = next(w, threechars + 9); - assert (cp == 0x65e5); - assert (w == threechars + 7); - cp = next(w, threechars + 9); - assert (cp == 0x0448); - assert (w == threechars + 9); - - //peek_next - const char* const cw = twochars; - cp = peek_next(cw, cw + 6); - assert (cp == 0x65e5); - assert (cw == twochars); - - //prior - w = twochars + 3; - cp = prior (w, twochars); - assert (cp == 0x65e5); - assert (w == twochars); - - w = threechars + 9; - cp = prior(w, threechars); - assert (cp == 0x0448); - assert (w == threechars + 7); - cp = prior(w, threechars); - assert (cp == 0x65e5); - assert (w == threechars + 4); - cp = prior(w, threechars); - assert (cp == 0x10346); - assert (w == threechars); - - // advance - w = threechars; - advance(w, 2, threechars + 9); - assert(w == threechars + 7); - advance(w, -2, threechars); - assert(w == threechars); - advance(w, 3, threechars + 9); - assert(w == threechars + 9); - advance(w, -2, threechars); - assert(w == threechars + 4); - advance(w, -1, threechars); - assert(w == threechars); - - // distance - size_t dist = utf8::distance(twochars, twochars + 5); - assert (dist == 2); - - // utf32to8 - int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; - vector utf8result; - utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - assert (utf8result.size() == 9); - // try it with the return value; - char* utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]); - assert (utf8_end == &utf8result[0] + 9); - - //utf8to32 - vector utf32result; - utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - assert (utf32result.size() == 2); - // try it with the return value; - int* utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]); - assert (utf32_end == &utf32result[0] + 2); - - //utf16to8 - unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - utf8result.clear(); - utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - assert (utf8result.size() == 10); - // try it with the return value; - utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]); - assert (utf8_end == &utf8result[0] + 10); - - //utf8to16 - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - vector utf16result; - utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - assert (utf16result.size() == 4); - assert (utf16result[2] == 0xd834); - assert (utf16result[3] == 0xdd1e); - // try it with the return value; - unsigned short* utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]); - assert (utf16_end == &utf16result[0] + 4); - - //find_invalid - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - char* invalid = find_invalid(utf_invalid, utf_invalid + 6); - assert (invalid == utf_invalid + 5); - - //is_valid - bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - assert (bvalid == false); - bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); - assert (bvalid == true); - - //starts_with_bom - unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); - assert (bbom == true); - bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); - assert (no_bbom == false); - - //replace_invalid - char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - vector replace_invalid_result; - replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); - bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); - assert (bvalid); - const char fixed_invalid_sequence[] = "a????z"; - assert (sizeof(fixed_invalid_sequence) == replace_invalid_result.size()); - assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); - - // iterator - utf8::iterator it(threechars, threechars, threechars + 9); - utf8::iterator it2 = it; - assert (it2 == it); - assert (*it == 0x10346); - assert (*(++it) == 0x65e5); - assert ((*it++) == 0x65e5); - assert (*it == 0x0448); - assert (it != it2); - utf8::iterator endit (threechars + 9, threechars, threechars + 9); - assert (++it == endit); - assert (*(--it) == 0x0448); - assert ((*it--) == 0x0448); - assert (*it == 0x65e5); - assert (--it == utf8::iterator(threechars, threechars, threechars + 9)); - assert (*it == 0x10346); - - ////////////////////////////////////////////////////////// - //// Unchecked variants - ////////////////////////////////////////////////////////// - - //append - memset(u, 0, 5); - append(0x0448, u); - assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); - - append(0x65e5, u); - assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); - - append(0x10346, u); - assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); - - //next - w = twochars; - cp = unchecked::next(w); - assert (cp == 0x65e5); - assert (w == twochars + 3); - - w = threechars; - cp = unchecked::next(w); - assert (cp == 0x10346); - assert (w == threechars + 4); - cp = unchecked::next(w); - assert (cp == 0x65e5); - assert (w == threechars + 7); - cp = unchecked::next(w); - assert (cp == 0x0448); - assert (w == threechars + 9); - - //peek_next - cp = unchecked::peek_next(cw); - assert (cp == 0x65e5); - assert (cw == twochars); - - // advance - w = threechars; - unchecked::advance(w, 2); - assert(w == threechars + 7); - unchecked::advance(w, -2); - assert(w == threechars); - unchecked::advance(w, 3); - assert(w == threechars + 9); - unchecked::advance(w, -2); - assert(w == threechars + 4); - unchecked::advance(w, -1); - assert(w == threechars); - - // distance - dist = unchecked::distance(twochars, twochars + 5); - assert (dist == 2); - - // utf32to8 - utf8result.clear(); - unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - assert (utf8result.size() == 9); - // try it with the return value; - utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]); - assert(utf8_end == &utf8result[0] + 9); - - //utf8to32 - utf32result.clear(); - unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - assert (utf32result.size() == 2); - // try it with the return value; - utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]); - assert (utf32_end == &utf32result[0] + 2); - - //utf16to8 - utf8result.clear(); - unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - assert (utf8result.size() == 10); - // try it with the return value; - utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]); - assert (utf8_end == &utf8result[0] + 10); - - //utf8to16 - utf16result.clear(); - unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - assert (utf16result.size() == 4); - assert (utf16result[2] == 0xd834); - assert (utf16result[3] == 0xdd1e); - // try it with the return value; - utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]); - assert (utf16_end == &utf16result[0] + 4); - - // iterator - utf8::unchecked::iterator un_it(threechars); - utf8::unchecked::iterator un_it2 = un_it; - assert (un_it2 == un_it); - assert (*un_it == 0x10346); - assert (*(++un_it) == 0x65e5); - assert ((*un_it++) == 0x65e5); - assert (un_it != un_it2); - assert (*un_it == 0x0448); - utf8::unchecked::iterator un_endit (threechars + 9); - assert (++un_it == un_endit); - assert (*(--un_it) == 0x0448); - assert ((*un_it--) == 0x0448); - assert (*un_it == 0x65e5); - assert (--un_it == utf8::unchecked::iterator(threechars)); - assert (*un_it == 0x10346); -} - - diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..5a2cbd2 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,22 @@ +add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp) +add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp) +add_executable(apitests + ${PROJECT_SOURCE_DIR}/tests/test_checked_api.cpp + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_api.cpp + ${PROJECT_SOURCE_DIR}/tests/test_checked_iterator.cpp + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_iterator.cpp +) + +target_link_libraries(negative PRIVATE utf8::cpp) +target_link_libraries(cpp11 PRIVATE + utf8::cpp + gtest_main + ) +target_link_libraries(apitests PRIVATE + utf8::cpp + gtest_main +) + +add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt) +add_test(cpp11_test cpp11) +add_test(api_test apitests) diff --git a/test_drivers/negative/negative.cpp b/tests/negative.cpp similarity index 88% rename from test_drivers/negative/negative.cpp rename to tests/negative.cpp index 0f1015d..f1bcc99 100644 --- a/test_drivers/negative/negative.cpp +++ b/tests/negative.cpp @@ -1,4 +1,4 @@ -#include "../../source/utf8.h" +#include "utf8.h" using namespace utf8; #include @@ -17,13 +17,13 @@ int main(int argc, char** argv) test_file_path = argv[1]; else { cout << "Wrong number of arguments" << endl; - exit(0); + return 1; } // Open the test file ifstream fs8(test_file_path.c_str()); if (!fs8.is_open()) { cout << "Could not open " << test_file_path << endl; - return 0; + return 1; } // Read it line by line @@ -38,16 +38,22 @@ int main(int argc, char** argv) bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END); // Print out lines that contain unexpected invalid UTF-8 if (!is_valid(line.begin(), line.end())) { - if (expected_valid) + if (expected_valid) { cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; + return 1; + } // try fixing it: string fixed_line; replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); - if (!is_valid(fixed_line.begin(), fixed_line.end())) + if (!is_valid(fixed_line.begin(), fixed_line.end())) { cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n'; + return 1; + } } - else if (!expected_valid) + else if (!expected_valid) { cout << "Invalid utf-8 NOT detected at line " << line_count << '\n'; + return 1; + } } } diff --git a/tests/test_checked_api.cpp b/tests/test_checked_api.cpp new file mode 100644 index 0000000..c378815 --- /dev/null +++ b/tests/test_checked_api.cpp @@ -0,0 +1,188 @@ +#include "gtest/gtest.h" +#include "utf8/checked.h" + +#include +#include +using namespace utf8; +using namespace std; + + +TEST(CheckedAPITests, test_append) +{ + unsigned char u[5] = {0,0,0,0,0}; + append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x10346, u); + EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); +} + +TEST(CheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + int cp = next(w, twochars + 6); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(CheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + int cp = peek_next(cw, cw + 6); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(CheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + int cp = prior (w, twochars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(CheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + advance(w, 2, threechars + 9); + EXPECT_EQ(w, threechars + 7); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars); + advance(w, 3, threechars + 9); + EXPECT_EQ(w, threechars + 9); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars + 4); + advance(w, -1, threechars); + EXPECT_EQ(w, threechars); +} + +TEST(CheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = utf8::distance(twochars, twochars + 5); + EXPECT_EQ (dist, 2); +} + +TEST(CheckedAPITests, test_utf32to8) +{ + int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector utf32result; + utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CheckedAPITests, test_utf16to8) +{ + unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(CheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector utf16result; + utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} + +TEST(CheckedAPITests, test_find_invalid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); +} + +TEST(CheckedAPITests, test_is_valid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid, utf_invalid + 6); + EXPECT_FALSE (bvalid); + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); + EXPECT_TRUE (bvalid); +} + +TEST(CheckedAPITests, test_starts_with_bom) +{ + unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; + bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); + EXPECT_TRUE (bbom); + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); + EXPECT_FALSE (no_bbom); +} diff --git a/tests/test_checked_iterator.cpp b/tests/test_checked_iterator.cpp new file mode 100644 index 0000000..21c39b1 --- /dev/null +++ b/tests/test_checked_iterator.cpp @@ -0,0 +1,31 @@ +#include "gtest/gtest.h" +#include "utf8/checked.h" + +using namespace utf8; + + +TEST(CheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::iterator it(threechars, threechars, threechars + 9); + utf8::iterator it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + utf8::iterator endit (threechars + 9, threechars, threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(CheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::iterator it(threechars+9, threechars, threechars + 9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, utf8::iterator(threechars, threechars, threechars + 9)); + EXPECT_EQ (*it, 0x10346); +} diff --git a/tests/test_cpp11.cpp b/tests/test_cpp11.cpp new file mode 100644 index 0000000..edcff9d --- /dev/null +++ b/tests/test_cpp11.cpp @@ -0,0 +1,106 @@ +#include "gtest/gtest.h" +#include "utf8.h" +#include +using namespace utf8; +using namespace std; + +#if __cplusplus >= 201103L // C++ 11 or later + +TEST(CPP11APITests, test_append) +{ + string u; + append(0x0448, u); + EXPECT_EQ (u[0], char(0xd1)); + EXPECT_EQ (u[1], char(0x88)); + EXPECT_EQ (u.length(), 2); + + u.clear(); + append(0x65e5, u); + EXPECT_EQ (u[0], char(0xe6)); + EXPECT_EQ (u[1], char(0x97)); + EXPECT_EQ (u[2], char(0xa5)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x3044, u); + EXPECT_EQ (u[0], char(0xe3)); + EXPECT_EQ (u[1], char(0x81)); + EXPECT_EQ (u[2], char(0x84)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x10346, u); + EXPECT_EQ (u[0], char(0xf0)); + EXPECT_EQ (u[1], char(0x90)); + EXPECT_EQ (u[2], char(0x8d)); + EXPECT_EQ (u[3], char(0x86)); + EXPECT_EQ (u.length(), 4); +} + +TEST(CPP11APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string u = utf16to8(utf16string); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP11APITests, test_utf8to16) +{ + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP11APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + string utf8result = utf32to8(utf32string); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP11APITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP11APITests, test_find_invalid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP11APITests, test_is_valid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP11APITests, test_replace_invalid) +{ + string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP11APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} +#endif // C++ 11 or later diff --git a/test_data/negative/utf8_invalid.txt b/tests/test_data/utf8_invalid.txt similarity index 100% rename from test_data/negative/utf8_invalid.txt rename to tests/test_data/utf8_invalid.txt diff --git a/tests/test_unchecked_api.cpp b/tests/test_unchecked_api.cpp new file mode 100644 index 0000000..fae6cb6 --- /dev/null +++ b/tests/test_unchecked_api.cpp @@ -0,0 +1,148 @@ +#include "gtest/gtest.h" +#include "utf8/unchecked.h" + +#include +#include +using namespace utf8::unchecked; +using namespace std; + +TEST(UnCheckedAPITests, test_append) +{ + unsigned char u[5] = {0,0,0,0,0}; + append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x10346, u); + EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); +} + +TEST(UnCheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + int cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(UnCheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + int cp = peek_next(cw); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(UnCheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + int cp = prior (w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(UnCheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + utf8::unchecked::advance(w, 2); + EXPECT_EQ(w, threechars + 7); + utf8::unchecked::advance(w, -2); + EXPECT_EQ(w, threechars); + utf8::unchecked::advance(w, 3); + EXPECT_EQ(w, threechars + 9); + utf8::unchecked::advance(w, -2); + EXPECT_EQ(w, threechars + 4); + utf8::unchecked::advance(w, -1); + EXPECT_EQ(w, threechars); +} + +TEST(UnCheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = utf8::unchecked::distance(twochars, twochars + 5); + EXPECT_EQ (dist, 2); +} + +TEST(UnCheckedAPITests, test_utf32to8) +{ + int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(UnCheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector utf32result; + utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(UnCheckedAPITests, test_utf16to8) +{ + unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(UnCheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector utf16result; + utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} diff --git a/tests/test_unchecked_iterator.cpp b/tests/test_unchecked_iterator.cpp new file mode 100644 index 0000000..103e8e2 --- /dev/null +++ b/tests/test_unchecked_iterator.cpp @@ -0,0 +1,32 @@ +#include "gtest/gtest.h" +#include "utf8/unchecked.h" + +using namespace utf8::unchecked; + + +TEST(UnCheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::unchecked::iterator it(threechars); + utf8::unchecked::iterator it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + utf8::unchecked::iterator endit (threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(UnCheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::unchecked::iterator it(threechars+9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, utf8::unchecked::iterator(threechars)); + EXPECT_EQ (*it, 0x10346); + +}