From fd1cf49e926e2c56dacb794f70a04c1901fb8e33 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Feb 2021 22:22:38 +0300 Subject: [PATCH 1/6] Rewrite extractTextFromHTML function --- docker/test/fasttest/run.sh | 1 - src/Functions/extractTextFromHTML.cpp | 306 +++++++++ src/Functions/htmlOrXmlCoarseParse.cpp | 582 ------------------ src/Functions/registerFunctionsString.cpp | 13 +- .../01674_htm_xml_coarse_parse.sql | 13 +- .../01746_extract_text_from_html.reference | 106 ++++ .../01746_extract_text_from_html.sql | 61 ++ 7 files changed, 485 insertions(+), 597 deletions(-) create mode 100644 src/Functions/extractTextFromHTML.cpp delete mode 100644 src/Functions/htmlOrXmlCoarseParse.cpp create mode 100644 tests/queries/0_stateless/01746_extract_text_from_html.reference create mode 100644 tests/queries/0_stateless/01746_extract_text_from_html.sql diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 370311b13c5..1bfc91ecd92 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -356,7 +356,6 @@ function run_tests # JSON functions 01666_blns - 01674_htm_xml_coarse_parse ) (time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp new file mode 100644 index 00000000000..5bee4dc541f --- /dev/null +++ b/src/Functions/extractTextFromHTML.cpp @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + +ALWAYS_INLINE bool startsWith(const char * s, const char * end, const char * prefix) +{ + return s + strlen(prefix) < end && 0 == memcmp(s, prefix, strlen(prefix)); +} + +ALWAYS_INLINE bool 
checkAndSkip(const char * __restrict & s, const char * end, const char * prefix) +{ + if (startsWith(s, end, prefix)) + { + s += strlen(prefix); + return true; + } + return false; +} + +bool processComment(const char * __restrict & src, const char * end) +{ + if (!checkAndSkip(src, end, "world'); +Helloworld +SELECT extractTextFromHTML('Helloworld'); +Helloworld +SELECT extractTextFromHTML('Helloworld'); +Helloworld +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML('HelloWorld'); +HelloWorld +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML('HelloWorld'); +HelloWorld +SELECT extractTextFromHTML('Hello World'); +Hello World +SELECT extractTextFromHTML(''); + \t Hello,\rworld \n +SELECT extractTextFromHTML('Hello world!'); +Hello Hello\tworld world! +SELECT extractTextFromHTML('Helloworld!'); +HelloHello\tworldworld! +SELECT extractTextFromHTML('Hello world]]> world!'); +Hello Hello world world! 
+SELECT extractTextFromHTML('John Smith]]>'); +John Smith +SELECT extractTextFromHTML('John ]]>'); +John +SELECT extractTextFromHTML('John Smith]]>'); +John +SELECT extractTextFromHTML('John ]]>]]>'); +John Smith +SELECT extractTextFromHTML('John ]]> ]]>'); +John Smith +SELECT extractTextFromHTML('John]]> ]]>'); +John Smith +SELECT extractTextFromHTML('John ]]>]]>]]>'); +John ]]>Smith +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('HelloWorld goodbye'); +HelloWorld goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('HelloWorld goodbye'); +HelloWorld goodbye +SELECT extractTextFromHTML('HelloWorld goodbye'); +HelloWorld goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML('Hello]]> goodbye'); +Hello]]> goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql new file mode 100644 index 00000000000..9bdd153228f --- /dev/null +++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql @@ -0,0 +1,61 @@ +-- { echo } + +SELECT extractTextFromHTML(''); +SELECT extractTextFromHTML(' '); +SELECT extractTextFromHTML(' '); +SELECT extractTextFromHTML('Hello'); +SELECT extractTextFromHTML('Hello, world'); +SELECT extractTextFromHTML('Hello, world'); +SELECT extractTextFromHTML(' Hello, world'); +SELECT extractTextFromHTML(' Hello, world '); +SELECT extractTextFromHTML(' \t Hello,\rworld \n '); + +SELECT extractTextFromHTML('Hello world'); +SELECT extractTextFromHTML('Hello'); +SELECT extractTextFromHTML('Hello<>world'); +SELECT extractTextFromHTML('Helloworld'); +SELECT extractTextFromHTML('Helloworld'); +SELECT 
extractTextFromHTML('Helloworld'); +SELECT extractTextFromHTML('Helloworld'); +SELECT extractTextFromHTML('Helloworld'); + +SELECT extractTextFromHTML('Hello World'); +SELECT extractTextFromHTML('Hello World'); +SELECT extractTextFromHTML('HelloWorld'); +SELECT extractTextFromHTML('Hello World'); +SELECT extractTextFromHTML('Hello World'); +SELECT extractTextFromHTML('Hello World'); +SELECT extractTextFromHTML('HelloWorld'); +SELECT extractTextFromHTML('Hello World'); + +SELECT extractTextFromHTML(''); +SELECT extractTextFromHTML('Hello world!'); +SELECT extractTextFromHTML('Helloworld!'); + +SELECT extractTextFromHTML('Hello world]]> world!'); +SELECT extractTextFromHTML('John Smith]]>'); +SELECT extractTextFromHTML('John ]]>'); +SELECT extractTextFromHTML('John Smith]]>'); +SELECT extractTextFromHTML('John ]]>]]>'); +SELECT extractTextFromHTML('John ]]> ]]>'); +SELECT extractTextFromHTML('John]]> ]]>'); +SELECT extractTextFromHTML('John ]]>]]>]]>'); + +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('HelloWorld goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('HelloWorld goodbye'); +SELECT extractTextFromHTML('HelloWorld goodbye'); + +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); +SELECT extractTextFromHTML('Hello]]> goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); From 2ac673b12a9f02a36136263abd873159e28e4de8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Feb 2021 22:33:55 +0300 Subject: [PATCH 2/6] Update logic and tests --- src/Functions/extractTextFromHTML.cpp | 22 +++++-- .../01674_htm_xml_coarse_parse.reference | 2 +- .../01746_extract_text_from_html.reference | 57 +------------------ .../01746_extract_text_from_html.sql | 1 + 4 files changed, 22 insertions(+), 60 deletions(-) diff --git a/src/Functions/extractTextFromHTML.cpp 
b/src/Functions/extractTextFromHTML.cpp index 5bee4dc541f..c6a9b84b33e 100644 --- a/src/Functions/extractTextFromHTML.cpp +++ b/src/Functions/extractTextFromHTML.cpp @@ -61,7 +61,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest if (!checkAndSkip(src, end, "Hello, world world goodbye'); Hello goodbye -SELECT extractTextFromHTML('HelloWorld goodbye'); HelloWorld goodbye -SELECT extractTextFromHTML('HelloWorld goodbye'); HelloWorld goodbye -SELECT extractTextFromHTML('Hello goodbye'); Hello goodbye -SELECT extractTextFromHTML('Hello goodbye'); Hello goodbye -SELECT extractTextFromHTML('Hello]]> goodbye'); -Hello]]> goodbye -SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +Hello Hello goodbye diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql index 9bdd153228f..0004849df87 100644 --- a/tests/queries/0_stateless/01746_extract_text_from_html.sql +++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql @@ -57,5 +57,6 @@ SELECT extractTextFromHTML('Hello]]> goodbye'); +SELECT extractTextFromHTML('Hello]]> goodbye'); SELECT extractTextFromHTML('Hello goodbye'); From 0ab4afeeed567b4626b45bd7c7b984c085a6916b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Feb 2021 00:55:56 +0300 Subject: [PATCH 3/6] Tests and documentation --- .../01674_htm_xml_coarse_parse.reference | 2 +- .../01746_extract_text_from_html.reference | 107 ++++++++++++++---- .../01746_extract_text_from_html.sql | 10 ++ 3 files changed, 98 insertions(+), 21 deletions(-) diff --git a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference index 72af13aedd0..9cca4934551 100644 --- a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference +++ b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference @@ -2,7 +2,7 @@ Here is CDTATA. This is a white space test. 
-This is a complex test. world goodbye'); Hello goodbye -HelloWorld goodbye -HelloWorld goodbye +SELECT extractTextFromHTML('HelloWorld goodbye'); +Hello World goodbye +SELECT extractTextFromHTML('HelloWorld goodbye'); +Hello World goodbye +SELECT extractTextFromHTML('Hello goodbye'); Hello goodbye +SELECT extractTextFromHTML('Hello goodbye'); Hello goodbye +SELECT extractTextFromHTML('Hello]]> goodbye'); Hello +SELECT extractTextFromHTML('Hello]]> goodbye'); Hello goodbye -Hello +SELECT extractTextFromHTML('Hello]]> goodbye'); +Hello ]]> goodbye +SELECT extractTextFromHTML('Hello goodbye'); Hello goodbye +SELECT extractTextFromHTML('Hello goodbye'); +Hello goodbye +SELECT extractTextFromHTML(']]>'); +]]> +SELECT extractTextFromHTML(' + +
xkcd.com + +'); +xkcd.com diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql index 0004849df87..b4ccc775bef 100644 --- a/tests/queries/0_stateless/01746_extract_text_from_html.sql +++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql @@ -58,5 +58,15 @@ SELECT extractTextFromHTML('Hello]]> goodbye'); SELECT extractTextFromHTML('Hello]]> goodbye'); +SELECT extractTextFromHTML('Hello]]> goodbye'); +SELECT extractTextFromHTML('Hello goodbye'); SELECT extractTextFromHTML('Hello goodbye'); + +SELECT extractTextFromHTML(']]>'); + +SELECT extractTextFromHTML(' + +
xkcd.com + +'); From 4ab18cdcd8a5eff3f4e386a86361a60f61222e23 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Feb 2021 00:59:27 +0300 Subject: [PATCH 4/6] Tests and documentation --- src/Functions/extractTextFromHTML.cpp | 92 +++++++++++++++++++-------- 1 file changed, 65 insertions(+), 27 deletions(-) diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp index c6a9b84b33e..4b35eacaef0 100644 --- a/src/Functions/extractTextFromHTML.cpp +++ b/src/Functions/extractTextFromHTML.cpp @@ -6,6 +6,58 @@ #include +/** A function to extract text from HTML or XHTML. + * It does not necessarily conform 100% to any of the HTML, XML or XHTML standards, + * but the implementation is reasonably accurate and it is fast. + * + * The rules are the following: + * + * 1. Comments are skipped. Example: + * Comment must end with -->. Nested comments are not possible. + * Note: constructions like are not valid comments in HTML but will be skipped by other rules. + * + * 2. CDATA is pasted verbatim. + * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach. + * + * 3. 'script' and 'style' elements are removed with all their content. + * Note: it's assumed that closing tag cannot appear inside content. + * For example, in a JS string literal it has to be escaped as "<\/script>". + * Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA. + * Example: ]]> + * But still searched inside comments. Sometimes it becomes complicated: + * var y = "-->"; alert(x + y); + * Note: script and style can be the names of XML namespaces - then they are not treated like usual script or style. + * Example: Hello. + * Note: whitespaces are possible after closing tag name: but not before: < / script>. + * + * 4. Other tags or tag-like elements are skipped without inner content. + * Example: . 
+ * Note: it's expected that this HTML is illegal: + * Note: it will also skip something like tags: <>, , etc. + * Note: tag without end will be skipped to the end of input: + * 5. HTML and XML entities are not decoded. + * It should be processed by separate function. + * + * 6. Whitespaces in text are collapsed or inserted by specific rules. + * Whitespaces at beginning and at the end are removed. + * Consecutive whitespaces are collapsed. + * But if text is separated by other elements and there is no whitespace, it is inserted. + * It may be unnatural, examples: Helloworld, Helloworld + * - in HTML there will be no whitespace, but the function will insert it. + * But also consider: Hello

<p>world</p>, Hello<br>
world. + * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words. + * + * 7. Also note that correct handling of whitespaces would require + * support of
<pre> and CSS display and white-space properties.
+  *
+  * Usage example:
+  *
+  * SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
+  *
+  * - ClickHouse has embedded web browser.
+  */
+
 namespace DB
 {
 
@@ -56,18 +108,11 @@ bool processComment(const char * __restrict & src, const char * end)
     return true;
 }
 
-bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
 {
     if (!checkAndSkip(src, end, "(src, end);
+
+    if (needs_whitespace && src < lt)
     {
-        pending_whitespace = false;
         *dst = ' ';
         ++dst;
     }
 
-    const char * lt = find_first_symbols<'<'>(src, end);
-
     while (true)
     {
         const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
@@ -204,10 +245,7 @@ void copyText(const char * __restrict & src, const char * end, char * __restrict
 
         src = ws;
         while (src < lt && isWhitespaceASCII(*src))
-        {
-            pending_whitespace = true;
             ++src;
-        }
 
         if (src < lt)
         {
@@ -232,16 +270,16 @@ size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
       * - CDATA should be copied verbatim;
       */
 
-    char * dst_begin = dst;
     const char * end = src + size;
-    bool pending_whitespace = false;
+    char * dst_begin = dst;
 
     while (src < end)
     {
-        copyText(src, end, dst, pending_whitespace);
+        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
+        copyText(src, end, dst, needs_whitespace);
 
         processComment(src, end)
-            || processCDATA(src, end, dst, pending_whitespace)
+            || processCDATA(src, end, dst)
             || processElementAndSkipContent(src, end, "script")
             || processElementAndSkipContent(src, end, "style")
             || skipTag(src, end);

From e5ae9cbb6365dcf2122672e6587a95f19ebbd187 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov 
Date: Sun, 28 Feb 2021 04:03:22 +0300
Subject: [PATCH 5/6] Fix Arcadia

---
 src/Functions/ya.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index 20ba5f846a3..f8beaa8540c 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -246,6 +246,7 @@ SRCS(
     extractAllGroupsHorizontal.cpp
     extractAllGroupsVertical.cpp
     extractGroups.cpp
+    extractTextFromHTML.cpp
     extractTimeZoneFromFunctionArguments.cpp
     filesystem.cpp
     finalizeAggregation.cpp
@@ -291,7 +292,6 @@ SRCS(
     hasToken.cpp
     hasTokenCaseInsensitive.cpp
     hostName.cpp
-    htmlOrXmlCoarseParse.cpp
     hypot.cpp
     identity.cpp
     if.cpp

From ae9fea1d0af118a8f87b224d194d61da1567188b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov 
Date: Sun, 28 Feb 2021 04:05:04 +0300
Subject: [PATCH 6/6] Fix gcc and clang-tidy

---
 src/Functions/extractTextFromHTML.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
index 4b35eacaef0..528bd0c311f 100644
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -70,12 +70,12 @@ namespace ErrorCodes
 namespace
 {
 
-ALWAYS_INLINE bool startsWith(const char * s, const char * end, const char * prefix)
+inline bool startsWith(const char * s, const char * end, const char * prefix)
 {
     return s + strlen(prefix) < end && 0 == memcmp(s, prefix, strlen(prefix));
 }
 
-ALWAYS_INLINE bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
+inline bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
 {
     if (startsWith(s, end, prefix))
     {
@@ -140,7 +140,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
 
 bool processElementAndSkipContent(const char * __restrict & src, const char * end, const char * tag_name)
 {
-    auto old_src = src;
+    const auto * old_src = src;
 
     if (!(src < end && *src == '<'))
         return false;