Tests and documentation

2024-11-21 23:21:59 +00:00 · 2021-02-28 00:59:27 +03:00 · 2021-02-28 00:59:27 +03:00 · 4ab18cdcd8
commit 4ab18cdcd8
parent 0ab4afeeed
1 changed files with 65 additions and 27 deletions
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@ -6,6 +6,58 @@
 #include <Common/StringUtils/StringUtils.h>


+/** A function to extract text from HTML or XHTML.
+  * It does not necessarily 100% conform to any of the HTML, XML or XHTML standards,
+  * but the implementation is reasonably accurate and it is fast.
+  *
+  * The rules are the following:
+  *
+  * 1. Comments are skipped. Example: <!-- test -->
+  * Comment must end with -->. Nested comments are not possible.
+  * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
+  *
+  * 2. CDATA is pasted verbatim.
+  * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
+  *
+  * 3. 'script' and 'style' elements are removed with all their content.
+  * Note: it's assumed that closing tag cannot appear inside content.
+  * For example, in a JS string literal it has to be escaped as "<\/script>".
+  * Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA.
+  * Example: <script><![CDATA[</script>]]></script>
+  * But still searched inside comments. Sometimes it becomes complicated:
+  * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
+  * Note: script and style can be the names of XML namespaces - then they are not treated like usual script or style.
+  * Example: <script:a>Hello</script:a>.
+  * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
+  *
+  * 4. Other tags or tag-like elements are skipped without inner content.
+  * Example: <a>.</a>
+  * Note: it's expected that this HTML is illegal: <a test=">"></a>
+  * Note: it will also skip something like tags: <>, <!>, etc.
+  * Note: tag without end will be skipped to the end of input: <hello
+  * >
+  * 5. HTML and XML entities are not decoded.
+  * They should be processed by a separate function.
+  *
+  * 6. Whitespaces in text are collapsed or inserted by specific rules.
+  * Whitespaces at beginning and at the end are removed.
+  * Consecutive whitespaces are collapsed.
+  * But if text is separated by other elements and there is no whitespace, it is inserted.
+  * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
+  * - in HTML there will be no whitespace, but the function will insert it.
+  * But also consider: Hello<p>world</p>, Hello<br>world.
+  * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
+  *
+  * 7. Also note that correct handling of whitespaces would require
+  * support of <pre></pre> and CSS display and white-space properties.
+  *
+  * Usage example:
+  *
+  * SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
+  *
+  * - ClickHouse has embedded web browser.
+  */
+
 namespace DB
 {

@ -56,18 +108,11 @@ bool processComment(const char * __restrict & src, const char * end)
    return true;
 }

-bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
 {
    if (!checkAndSkip(src, end, "<![CDATA["))
        return false;

-    if (dst && pending_whitespace && src < end)
-    {
-        pending_whitespace = false;
-        *dst = ' ';
-        ++dst;
-    }
-
    const char * gt = src;
    while (true)
    {
@ -131,14 +176,14 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en

        ++src;

-        /// Skip comments and CDATA
+        /// Skip CDATA
        if (*src == '!')
        {
            --src;
-            bool pending_whitespace = false;
            char * dst = nullptr;
-            processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
-            continue;
+            if (processCDATA(src, end, dst))
+                continue;
+            ++src;
        }

        if (*src != '/')
@ -178,23 +223,19 @@ bool skipTag(const char * __restrict & src, const char * end)
    return false;
 }

-void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
 {
    while (src < end && isWhitespaceASCII(*src))
-    {
-        pending_whitespace = true;
        ++src;
-    }

-    if (pending_whitespace && src < end)
+    const char * lt = find_first_symbols<'<'>(src, end);
+
+    if (needs_whitespace && src < lt)
    {
-        pending_whitespace = false;
        *dst = ' ';
        ++dst;
    }

-    const char * lt = find_first_symbols<'<'>(src, end);
-
    while (true)
    {
        const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
@ -204,10 +245,7 @@ void copyText(const char * __restrict & src, const char * end, char * __restrict

        src = ws;
        while (src < lt && isWhitespaceASCII(*src))
-        {
-            pending_whitespace = true;
            ++src;
-        }

        if (src < lt)
        {
@ -232,16 +270,16 @@ size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
      * - CDATA should be copied verbatim;
      */

-    char * dst_begin = dst;
    const char * end = src + size;
-    bool pending_whitespace = false;
+    char * dst_begin = dst;

    while (src < end)
    {
-        copyText(src, end, dst, pending_whitespace);
+        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
+        copyText(src, end, dst, needs_whitespace);

        processComment(src, end)
-            || processCDATA(src, end, dst, pending_whitespace)
+            || processCDATA(src, end, dst)
            || processElementAndSkipContent(src, end, "script")
            || processElementAndSkipContent(src, end, "style")
            || skipTag(src, end);