From fd1cf49e926e2c56dacb794f70a04c1901fb8e33 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sat, 27 Feb 2021 22:22:38 +0300
Subject: [PATCH 1/6] Rewrite extractTextFromHTML function

---
 docker/test/fasttest/run.sh                   |   1 -
 src/Functions/extractTextFromHTML.cpp         | 306 +++++++++
 src/Functions/htmlOrXmlCoarseParse.cpp        | 582 ------------------
 src/Functions/registerFunctionsString.cpp     |  13 +-
 .../01674_htm_xml_coarse_parse.sql            |  13 +-
 .../01746_extract_text_from_html.reference    | 106 ++++
 .../01746_extract_text_from_html.sql          |  61 ++
 7 files changed, 485 insertions(+), 597 deletions(-)
 create mode 100644 src/Functions/extractTextFromHTML.cpp
 delete mode 100644 src/Functions/htmlOrXmlCoarseParse.cpp
 create mode 100644 tests/queries/0_stateless/01746_extract_text_from_html.reference
 create mode 100644 tests/queries/0_stateless/01746_extract_text_from_html.sql

diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 370311b13c5..1bfc91ecd92 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -356,7 +356,6 @@ function run_tests
 
         # JSON functions
         01666_blns
-        01674_htm_xml_coarse_parse
     )
 
     (time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
new file mode 100644
index 00000000000..5bee4dc541f
--- /dev/null
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -0,0 +1,306 @@
+#include <Columns/ColumnString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunctionImpl.h>
+#include <common/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+/// True if `prefix` is found at `s` with at least one more byte after it before `end`. No pointer beyond `end` is formed.
+ALWAYS_INLINE bool startsWith(const char * s, const char * end, const char * prefix)
+{
+    return s < end && static_cast<size_t>(end - s) > strlen(prefix) && 0 == memcmp(s, prefix, strlen(prefix));
+}
+
+/// If startsWith(s, end, prefix) holds, moves `s` just past the prefix and returns true; otherwise `s` is untouched.
+ALWAYS_INLINE bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
+{
+    if (!startsWith(s, end, prefix))
+        return false;
+
+    s += strlen(prefix);
+    return true;
+}
+
+/// Skips an HTML comment "<!-- ... -->" starting at `src`. Returns false and keeps `src` if there is no comment here.
+bool processComment(const char * __restrict & src, const char * end)
+{
+    if (!checkAndSkip(src, end, "<!--"))
+        return false;
+
+    /// The closing "--" is looked for relative to the start of the comment body and not relative to the moving `src`:
+    /// otherwise the terminator in "<!---->" or "<!-- a >-->" is missed and text up to the next '>' may be swallowed.
+    const char * const body_begin = src;
+    while (true)
+    {
+        const char * gt = find_first_symbols<'>'>(src, end);
+        if (gt >= end)
+            break;  /// Not terminated: `src` stays after the last '>' seen (or at the body start if there was none).
+
+        src = gt + 1;
+        if (gt - body_begin >= 2 && gt[-1] == '-' && gt[-2] == '-')
+            break;
+    }
+
+    return true;
+}
+
+/// Handles "<![CDATA[ ... ]]>" starting at `src`: the payload is copied to `dst` as is, without whitespace folding.
+/// Returns false and changes nothing if `src` does not start with the CDATA opener.
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+{
+    if (!checkAndSkip(src, end, "<![CDATA["))
+        return false;
+
+    /// Whitespace seen earlier but not written yet becomes a single space in front of the payload.
+    if (pending_whitespace && src < end)
+    {
+        *dst++ = ' ';
+        pending_whitespace = false;
+    }
+
+    /// Visit every '>' and stop at the first one preceded by "]]". Reading pos[-1] and pos[-2] stays in bounds
+    /// because the opener that was just skipped lies directly before `src`.
+    const char * pos = find_first_symbols<'>'>(src, end);
+    while (pos < end)
+    {
+        if (pos[-1] == ']' && pos[-2] == ']')
+        {
+            const size_t payload_size = pos - src - strlen("]]");
+            memcpy(dst, src, payload_size);
+            dst += payload_size;
+            src = pos + 1;
+            break;
+        }
+        pos = find_first_symbols<'>'>(pos + 1, end);
+    }
+
+    /// Without a terminating "]]>" nothing is copied and `src` is left right after the opener.
+    return true;
+}
+
+/// If `src` points at the opening tag of the element `tag_name` (e.g. "script"), skips the whole element: the opening
+/// tag with its attributes, the content, and the closing tag "</tag_name>" (whitespace is allowed before its '>').
+/// If the element is not closed, skips to the end of input (a '<' in the very last byte is left for the caller).
+/// Returns false and keeps `src` if there is no complete opening tag of this element here.
+bool processElementAndSkipContent(const char * __restrict & src, const char * end, const char * tag_name)
+{
+    const char * const old_src = src;
+
+    if (!(src < end && *src == '<'))
+        return false;
+    ++src;
+
+    /// The name must be followed by whitespace or '>'; a longer name that merely starts with `tag_name` is rejected.
+    if (!checkAndSkip(src, end, tag_name) || src >= end || !(isWhitespaceASCII(*src) || *src == '>'))
+    {
+        src = old_src;
+        return false;
+    }
+
+    const char * gt = find_first_symbols<'>'>(src, end);
+    if (gt >= end)
+    {
+        /// The opening tag is not terminated. `src` has to be rolled back: otherwise the caller resumes in the middle
+        /// of the tag and outputs its attributes as text ("<script foo" gave " foo", while "<div foo" gives nothing).
+        src = old_src;
+        return false;
+    }
+    src = gt + 1;
+
+    /// Skip the content: look for "</tag_name", optional whitespace, '>'.
+    while (true)
+    {
+        src = find_first_symbols<'<'>(src, end);
+        if (end - src <= 1)
+            break;
+
+        ++src;
+        if (*src != '/')
+            continue;
+        ++src;
+
+        if (!checkAndSkip(src, end, tag_name))
+            continue;
+
+        while (src < end && isWhitespaceASCII(*src))
+            ++src;
+
+        if (src >= end)
+            break;
+
+        if (*src == '>')
+        {
+            ++src;
+            break;
+        }
+    }
+
+    return true;
+}
+
+/// Skips a single tag: everything from the '<' at `src` through the nearest '>'.
+/// Returns false and keeps `src` if it does not point at '<'.
+bool skipTag(const char * __restrict & src, const char * end)
+{
+    if (src >= end || *src != '<')
+        return false;
+
+    /// When there is no '>' at all, the tag is considered to extend to the end of input.
+    const char * gt = find_first_symbols<'>'>(src, end);
+    src = gt < end ? gt + 1 : gt;
+
+    return true;
+}
+
+/// Copies the text between `src` and the next '<' (or `end`) to `dst`, writing one space for every run of ASCII
+/// whitespace. On return `src` points at that '<' (or equals `end`).
+/// `pending_whitespace` lives across calls: true means that whitespace was consumed but its space is not written
+/// yet. The space is written only once more input is known to follow.
+void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+{
+    while (src < end && isWhitespaceASCII(*src))
+    {
+        pending_whitespace = true;
+        ++src;
+    }
+    if (pending_whitespace && src < end)
+    {
+        *dst++ = ' ';
+        pending_whitespace = false;
+    }
+
+    const char * lt = find_first_symbols<'<'>(src, end);
+    while (true)
+    {
+        /// Copy one word as is, then skip the whitespace run that follows it.
+        const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
+        memcpy(dst, src, ws - src);
+        dst += ws - src;
+
+        src = ws;
+        while (src < lt && isWhitespaceASCII(*src))
+            ++src;
+
+        if (src >= lt)
+        {
+            /// The run (if any) reaches the end of this piece of text: keep it pending for whatever comes next.
+            if (src != ws)
+                pending_whitespace = true;
+            break;
+        }
+
+        /// Another word follows, so the space is written right away and nothing stays pending. If the flag were
+        /// left set here, "a b<i>c</i>" would turn into "a b c" instead of "a bc".
+        *dst++ = ' ';
+    }
+
+    src = lt;
+}
+
+size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
+{
+    /** Writes the text of the HTML document [src, src + size) to dst, returns the number of bytes written. Rules:
+      * - comments are removed with all their content;
+      * - elements 'script' and 'style' are removed with all their content;
+      * - for other elements tags are removed but content is processed as text;
+      * - CDATA is copied verbatim.
+      */
+    const char * const end = src + size;
+    char * const dst_begin = dst;
+    bool pending_whitespace = false;
+
+    while (src < end)
+    {
+        /// Plain text up to the next '<' comes first ...
+        copyText(src, end, dst, pending_whitespace);
+        /// ... then at most one markup construct: the first handler that recognizes it consumes it.
+        if (processComment(src, end) || processCDATA(src, end, dst, pending_whitespace))
+            continue;
+        if (processElementAndSkipContent(src, end, "script") || processElementAndSkipContent(src, end, "style"))
+            continue;
+        skipTag(src, end);
+    }
+
+    return dst - dst_begin;
+}
+
+}
+
+
+/// Function extractTextFromHTML(String) -> String. Applies extract() to every row of the argument column.
+class FunctionExtractTextFromHTML : public IFunction
+{
+public:
+    static constexpr auto name = "extractTextFromHTML";
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionExtractTextFromHTML>(); }
+    String getName() const override { return name; }
+    size_t getNumberOfArguments() const override { return 1; }
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    /// The argument must be a String; the result has the same type.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (isString(arguments[0]))
+            return arguments[0];
+
+        throw Exception(
+            "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
+    {
+        const auto * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
+        if (!src)
+            throw Exception("First argument for function " + getName() + " must be string.", ErrorCodes::ILLEGAL_COLUMN);
+
+        const ColumnString::Chars & src_chars = src->getChars();
+        const ColumnString::Offsets & src_offsets = src->getOffsets();
+
+        auto res = ColumnString::create();
+        ColumnString::Chars & res_chars = res->getChars();
+        ColumnString::Offsets & res_offsets = res->getOffsets();
+
+        /// Sized like the input (relies on extract() never writing more than it reads); trimmed at the end.
+        res_chars.resize(src_chars.size());
+        res_offsets.resize(src_offsets.size());
+
+        ColumnString::Offset row_begin = 0;
+        ColumnString::Offset written = 0;
+        for (size_t i = 0; i < rows; ++i)
+        {
+            const auto row_end = src_offsets[i];
+            /// "- 1": the last byte of each row is not passed to extract(); the result gets its own zero byte below.
+            written += extract(
+                reinterpret_cast<const char *>(&src_chars[row_begin]),
+                row_end - row_begin - 1,
+                reinterpret_cast<char *>(&res_chars[written]));
+
+            res_chars[written] = 0;
+            res_offsets[i] = ++written;
+            row_begin = row_end;
+        }
+
+        res_chars.resize(written);
+        return res;
+    }
+};
+/// Registers the extractTextFromHTML function in the given function factory.
+void registerFunctionExtractTextFromHTML(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionExtractTextFromHTML>();
+}
+
+}
diff --git a/src/Functions/htmlOrXmlCoarseParse.cpp b/src/Functions/htmlOrXmlCoarseParse.cpp
deleted file mode 100644
index 442de3d36b0..00000000000
--- a/src/Functions/htmlOrXmlCoarseParse.cpp
+++ /dev/null
@@ -1,582 +0,0 @@
-#include <Columns/ColumnString.h>
-#include <Functions/FunctionFactory.h>
-#include <Functions/FunctionHelpers.h>
-#include <Functions/IFunctionImpl.h>
-
-#include <utility>
-#include <vector>
-#include <algorithm>
-
-#if USE_HYPERSCAN
-#   include <hs.h>
-
-namespace DB
-{
-namespace ErrorCodes
-{
-    extern const int ILLEGAL_COLUMN;
-    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
-    extern const int CANNOT_ALLOCATE_MEMORY;
-    extern const int NOT_IMPLEMENTED;
-}
-
-namespace
-{
-struct HxCoarseParseImpl
-{
-private:
-    struct SpanInfo
-    {
-        SpanInfo(): id(0), match_space(std::pair<unsigned long long, unsigned long long>(0, 0)) {}  // NOLINT
-        SpanInfo(unsigned int matchId, std::pair<unsigned long long, unsigned long long> matchSpan): id(matchId), match_space(matchSpan){} // NOLINT
-        SpanInfo(const SpanInfo& obj)
-        {
-            id = obj.id;
-            match_space = obj.match_space;
-        }
-        SpanInfo& operator=(const SpanInfo& obj) = default;
-
-        unsigned int id;
-        std::pair<unsigned long long, unsigned long long> match_space;  // NOLINT
-    };
-    using SpanElement = std::vector<SpanInfo>;
-    struct Span
-    {
-        Span(): set_script(false), set_style(false), set_semi(false), is_finding_cdata(false) {}
-
-        SpanElement copy_stack;         // copy area
-        SpanElement tag_stack;          // regexp area
-        SpanInfo script_ptr;            // script pointer
-        bool set_script;                // whether set script
-        SpanInfo style_ptr;             // style pointer
-        bool set_style;                 // whether set style
-        SpanInfo semi_ptr;              // tag ptr
-        bool set_semi;                  // whether set semi
-
-        bool is_finding_cdata;
-    };
-
-    static inline void copyZone(
-        ColumnString::Offset& current_dst_string_offset,
-        ColumnString::Offset& current_copy_loc,
-        ColumnString::Chars& dst_chars,
-        const ColumnString::Chars& src_chars,
-        size_t bytes_to_copy,
-        unsigned is_space
-    )
-    {
-        bool is_last_space = false;
-        if (current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' ')
-        {
-            is_last_space = true;
-        }
-        if (bytes_to_copy == 0)
-        {
-            if (is_space && !is_last_space)
-            {
-                dst_chars[current_dst_string_offset++] = ' ';
-            }
-        }
-        else
-        {
-            if (is_last_space && src_chars[current_copy_loc] == ' ')
-            {
-                --bytes_to_copy;
-                ++current_copy_loc;
-            }
-            if (bytes_to_copy > 0)
-            {
-                memcpySmallAllowReadWriteOverflow15(
-                    &dst_chars[current_dst_string_offset], &src_chars[current_copy_loc], bytes_to_copy);
-                current_dst_string_offset += bytes_to_copy;
-            }
-
-            // separator is space and last character is not space.
-            if (is_space && !(current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' '))
-            {
-                dst_chars[current_dst_string_offset++] = ' ';
-            }
-        }
-        // return;
-    }
-    static inline void popArea(SpanElement& stack, unsigned long long from, unsigned long long to)  //NOLINT
-    {
-        while (!stack.empty())
-        {
-            if (to > stack.back().match_space.second && from < stack.back().match_space.second)
-            {
-                stack.pop_back();
-            }
-            else
-            {
-                break;
-            }
-        }
-        // return;
-    }
-
-    static void dealCommonTag(Span* matches)
-    {
-        while (!matches->copy_stack.empty() && matches->copy_stack.back().id != 10)
-        {
-            matches->copy_stack.pop_back();
-        }
-        if (!matches->copy_stack.empty())
-        {
-            matches->copy_stack.pop_back();
-        }
-        unsigned long long from;    // NOLINT
-        unsigned long long to;      // NOLINT
-        unsigned id;
-        for (auto begin = matches->tag_stack.begin(); begin != matches->tag_stack.end(); ++begin)
-        {
-            from = begin->match_space.first;
-            to = begin->match_space.second;
-            id = begin->id;
-            switch (id)
-            {
-                case 12:
-                case 13:
-                {
-                    popArea(matches->copy_stack, from, to);
-                    if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
-                        matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-                    break;
-                }
-                case 0:
-                case 2:
-                case 3:
-                case 4:
-                case 5:
-                case 6:
-                case 7:
-                case 8:
-                case 9:
-                case 10:
-                {
-                    if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
-                    {
-                        matches->set_semi = true;
-                        matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
-                    }
-                    break;
-                }
-                case 1:
-                {
-                    if (matches->set_semi)
-                    {
-                        switch (matches->semi_ptr.id)
-                        {
-                            case 0:
-                            case 2:
-                            case 3:
-                            case 6:
-                            case 7:
-                            case 10:
-                            {
-                                if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from))
-                                {
-                                    if (!matches->set_script)
-                                    {
-                                        matches->set_script = true;
-                                        matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
-                                    }
-                                }
-                                else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from))
-                                {
-                                    if (!matches->set_style)
-                                    {
-                                        matches->set_style = true;
-                                        matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
-                                    }
-                                }
-                                popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to);
-                                matches->copy_stack.push_back(SpanInfo(0, std::make_pair(matches->semi_ptr.match_space.first, to)));
-                                matches->set_semi = false;
-                                break;
-                            }
-                            case 4:
-                            case 5:
-                            case 8:
-                            case 9:
-                            {
-                                SpanInfo complete_zone;
-
-                                complete_zone.match_space.second = to;
-                                if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from)))
-                                {
-                                    complete_zone.id = matches->script_ptr.id;
-                                    complete_zone.match_space.first = matches->script_ptr.match_space.first;
-                                    matches->set_script = false;
-                                }
-                                else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from)))
-                                {
-                                    complete_zone.id = matches->style_ptr.id;
-                                    complete_zone.match_space.first = matches->style_ptr.match_space.first;
-                                    matches->set_style = false;
-                                }
-                                else
-                                {
-                                    complete_zone.id = matches->semi_ptr.id;
-                                    complete_zone.match_space.first = matches->semi_ptr.match_space.first;
-                                }
-                                popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second);
-                                matches->copy_stack.push_back(complete_zone);
-                                matches->set_semi = false;
-                                break;
-                            }
-                        }
-                    }
-                    break;
-                }
-                default:
-                {
-                    break;
-                }
-            }
-        }
-        // return;
-    }
-    static int spanCollect(unsigned int id,
-                          unsigned long long from,  // NOLINT
-                          unsigned long long to,    // NOLINT
-                          unsigned int , void * ctx)
-    {
-        Span* matches = static_cast<Span*>(ctx);
-        from = id == 12 ? from : to - patterns_length[id];
-
-        if (matches->is_finding_cdata)
-        {
-            if (id == 11)
-            {
-                matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-                matches->is_finding_cdata = false;
-                matches->tag_stack.clear();
-                if (matches->semi_ptr.id == 10)
-                {
-                    matches->set_semi = false;
-                }
-            }
-            else if (id == 12 || id == 13)
-            {
-                popArea(matches->copy_stack, from, to);
-                if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
-                    matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-
-                popArea(matches->tag_stack, from, to);
-                if (matches->tag_stack.empty() || from >= matches->tag_stack.back().match_space.second)
-                    matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-            }
-            else
-            {
-                popArea(matches->tag_stack, from, to);
-                matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-            }
-        }
-        else
-        {
-            switch (id)
-            {
-                case 12:
-                case 13:
-                {
-                    popArea(matches->copy_stack, from, to);
-                    if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
-                        matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-                    break;
-                }
-                case 0:
-                case 2:
-                case 3:
-                case 4:
-                case 5:
-                case 6:
-                case 7:
-                case 8:
-                case 9:
-                {
-                    if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
-                    {
-                        matches->set_semi = true;
-                        matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
-                    }
-                    break;
-                }
-                case 10:
-                {
-                    if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
-                    {
-                        matches->set_semi = true;
-                        matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
-                    }
-                    matches->is_finding_cdata = true;
-                    matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-                    matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
-                    break;
-                }
-                case 1:
-                {
-                    if (matches->set_semi)
-                    {
-                        switch (matches->semi_ptr.id)
-                        {
-                            case 0:
-                            case 2:
-                            case 3:
-                            case 6:
-                            case 7:
-                            case 10:
-                            {
-                                if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from))
-                                {
-                                    if (!matches->set_script)
-                                    {
-                                        matches->set_script = true;
-                                        matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
-                                    }
-                                }
-                                else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from))
-                                {
-                                    if (!matches->set_style)
-                                    {
-                                        matches->set_style = true;
-                                        matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
-                                    }
-                                }
-                                popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to);
-                                matches->copy_stack.push_back(SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)));
-                                matches->set_semi = false;
-                                break;
-                            }
-                            case 4:
-                            case 5:
-                            case 8:
-                            case 9:
-                            {
-                                SpanInfo complete_zone;
-                                complete_zone.match_space.second = to;
-                                if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from)))
-                                {
-                                    complete_zone.id = matches->script_ptr.id;
-                                    complete_zone.match_space.first = matches->script_ptr.match_space.first;
-                                    matches->set_script = false;
-                                }
-                                else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from)))
-                                {
-                                    complete_zone.id = matches->style_ptr.id;
-                                    complete_zone.match_space.first = matches->style_ptr.match_space.first;
-                                    matches->set_style = false;
-                                }
-                                else
-                                {
-                                    complete_zone.id = matches->semi_ptr.id;
-                                    complete_zone.match_space.first = matches->semi_ptr.match_space.first;
-                                }
-                                popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second);
-                                matches->copy_stack.push_back(complete_zone);
-                                matches->set_semi = false;
-                                break;
-                            }
-                        }
-                    }
-                    break;
-                }
-                default:
-                {
-                    break;
-                }
-            }
-        }
-        return 0;
-    }
-    #if USE_HYPERSCAN
-    static hs_database_t* buildDatabase(const std::vector<const char* > &expressions,
-                                        const std::vector<unsigned> &flags,
-                                        const std::vector<unsigned> &id,
-                                        unsigned int mode)
-    {
-        hs_database_t *db;
-        hs_compile_error_t *compile_err;
-        hs_error_t err;
-        err = hs_compile_multi(expressions.data(), flags.data(), id.data(),
-                            expressions.size(), mode, nullptr, &db, &compile_err);
-
-        if (err != HS_SUCCESS)
-        {
-            hs_free_compile_error(compile_err);
-            throw Exception("Hyper scan database cannot be compiled.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
-        }
-        return db;
-    }
-    #endif
-    static std::vector<const char*> patterns;
-    static std::vector<std::size_t> patterns_length;
-    static std::vector<unsigned> patterns_flag;
-    static std::vector<unsigned> ids;
-
-public:
-    static void executeInternal(
-        const ColumnString::Chars & src_chars,
-        const ColumnString::Offsets & src_offsets,
-        ColumnString::Chars & dst_chars,
-        ColumnString::Offsets & dst_offsets)
-    {
-    #if USE_HYPERSCAN
-        hs_database_t * db = buildDatabase(patterns, patterns_flag, ids, HS_MODE_BLOCK);
-        hs_scratch_t* scratch = nullptr;
-        if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS)
-        {
-            hs_free_database(db);
-            throw Exception("Unable to allocate scratch space.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
-        }
-        dst_chars.resize(src_chars.size());
-        dst_offsets.resize(src_offsets.size());
-
-        ColumnString::Offset current_src_string_offset = 0;
-        ColumnString::Offset current_dst_string_offset = 0;
-        ColumnString::Offset current_copy_loc;
-        ColumnString::Offset current_copy_end;
-        unsigned is_space;
-        size_t bytes_to_copy;
-        Span match_zoneall;
-
-        for (size_t off = 0; off < src_offsets.size(); ++off)
-        {
-            hs_scan(db, reinterpret_cast<const char *>(&src_chars[current_src_string_offset]), src_offsets[off] - current_src_string_offset, 0, scratch, spanCollect, &match_zoneall);
-            if (match_zoneall.is_finding_cdata)
-            {
-                dealCommonTag(&match_zoneall);
-            }
-            SpanElement& match_zone = match_zoneall.copy_stack;
-            current_copy_loc = current_src_string_offset;
-            if (match_zone.empty())
-            {
-                current_copy_end = src_offsets[off];
-                is_space = 0;
-            }
-            else
-            {
-                current_copy_end = current_src_string_offset + match_zone.begin()->match_space.first;
-                is_space = (match_zone.begin()->id == 12 || match_zone.begin()->id == 13)?1:0;
-            }
-
-            bytes_to_copy = current_copy_end - current_copy_loc;
-            copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space);
-            for (auto begin = match_zone.begin(); begin != match_zone.end(); ++begin)
-            {
-                current_copy_loc = current_src_string_offset + begin->match_space.second;
-                if (begin + 1 >= match_zone.end())
-                {
-                    current_copy_end = src_offsets[off];
-                    is_space = 0;
-                }
-                else
-                {
-                    current_copy_end = current_src_string_offset + (begin+1)->match_space.first;
-                    is_space = ((begin+1)->id == 12 || (begin+1)->id == 13)?1:0;
-                }
-                bytes_to_copy = current_copy_end - current_copy_loc;
-                copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space);
-            }
-            if (current_dst_string_offset > 1 && dst_chars[current_dst_string_offset - 2] == ' ')
-            {
-                dst_chars[current_dst_string_offset - 2] = 0;
-                --current_dst_string_offset;
-            }
-            dst_offsets[off] = current_dst_string_offset;
-            current_src_string_offset = src_offsets[off];
-            match_zoneall.copy_stack.clear();
-            match_zoneall.tag_stack.clear();
-        }
-            dst_chars.resize(dst_chars.size());
-            hs_free_scratch(scratch);
-            hs_free_database(db);
-    #else
-        (void)src_chars;
-        (void)src_offsets;
-        (void)dst_chars;
-        (void)dst_offsets;
-        throw Exception(
-            "htmlOrXmlCoarseParse is not implemented when hyperscan is off (is it x86 processor?)",
-            ErrorCodes::NOT_IMPLEMENTED);
-    #endif
-    }
-};
-
-std::vector<const char*> HxCoarseParseImpl::patterns =
-    {
-        "<[^\\s<>]",       // 0  "<", except "< ", "<<", "<>"
-        ">",               // 1  ">"
-        "<script\\s",      // 2  <script xxxxx>
-        "<script",         // 3  <script>
-        "</script\\s",     // 4  </script xxxx>
-        "</script",        // 5  </script>
-        "<style\\s",       // 6  <style xxxxxx>
-        "<style",          // 7  <style>
-        "</style\\s",      // 8  </style xxxxx>
-        "</style",         // 9  </style>
-        "<!\\[CDATA\\[",   // 10 <![CDATA[xxxxxx]]>
-        "\\]\\]>",         // 11 ]]>
-        "\\s{2,}",         // 12 "   ", continuous blanks
-        "[^\\S ]"          // 13 "\n", "\t" and other white space, it does not include single ' '.
-    };
-std::vector<std::size_t> HxCoarseParseImpl::patterns_length =
-    {
-        2, 1, 8, 7, 9, 8, 7, 6, 8, 7, 9, 3, 0, 1
-    };
-#if USE_HYPERSCAN
-std::vector<unsigned> HxCoarseParseImpl::patterns_flag =
-    {
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, HS_FLAG_SOM_LEFTMOST, 0
-    };
-#endif
-std::vector<unsigned> HxCoarseParseImpl::ids =
-    {
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
-    };
-
-class FunctionHtmlOrXmlCoarseParse : public IFunction
-{
-public:
-    static constexpr auto name = "htmlOrXmlCoarseParse";
-
-    static FunctionPtr create(const Context &) {return std::make_shared<FunctionHtmlOrXmlCoarseParse>(); }
-
-    String getName() const override {return name;}
-
-    size_t getNumberOfArguments() const override {return 1;}
-
-    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
-    {
-        if (!isString(arguments[0]))
-            throw Exception(
-                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-        return arguments[0];
-    }
-
-    bool useDefaultImplementationForConstants() const override {return true;}
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & , size_t) const override
-    {
-        const auto & strcolumn = arguments[0].column;
-        if (const ColumnString* html_sentence = checkAndGetColumn<ColumnString>(strcolumn.get()))
-        {
-            auto col_res = ColumnString::create();
-            HxCoarseParseImpl::executeInternal(html_sentence->getChars(), html_sentence->getOffsets(), col_res->getChars(), col_res->getOffsets());
-            return col_res;
-        }
-        else
-        {
-            throw Exception("First argument for function " + getName() + " must be string.", ErrorCodes::ILLEGAL_COLUMN);
-        }
-    }
-};
-}
-
-void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory & factory)
-{
-    factory.registerFunction<FunctionHtmlOrXmlCoarseParse>();
-}
-
-}
-#endif
diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp
index b6327dfb92f..f6f95489f82 100644
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@@ -6,9 +6,7 @@ namespace DB
 {
 
 class FunctionFactory;
-#if USE_HYPERSCAN
-void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory &);
-#endif
+
 void registerFunctionRepeat(FunctionFactory &);
 void registerFunctionEmpty(FunctionFactory &);
 void registerFunctionNotEmpty(FunctionFactory &);
@@ -35,8 +33,9 @@ void registerFunctionRegexpQuoteMeta(FunctionFactory &);
 void registerFunctionNormalizeQuery(FunctionFactory &);
 void registerFunctionNormalizedQueryHash(FunctionFactory &);
 void registerFunctionCountMatches(FunctionFactory &);
-void registerFunctionEncodeXMLComponent(FunctionFactory & factory);
-void registerFunctionDecodeXMLComponent(FunctionFactory & factory);
+void registerFunctionEncodeXMLComponent(FunctionFactory &);
+void registerFunctionDecodeXMLComponent(FunctionFactory &);
+void registerFunctionExtractTextFromHTML(FunctionFactory &);
 
 
 #if USE_BASE64
@@ -47,9 +46,6 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
 
 void registerFunctionsString(FunctionFactory & factory)
 {
-#if USE_HYPERSCAN
-    registerFunctionHtmlOrXmlCoarseParse(factory);
-#endif
     registerFunctionRepeat(factory);
     registerFunctionEmpty(factory);
     registerFunctionNotEmpty(factory);
@@ -78,6 +74,7 @@ void registerFunctionsString(FunctionFactory & factory)
     registerFunctionCountMatches(factory);
     registerFunctionEncodeXMLComponent(factory);
     registerFunctionDecodeXMLComponent(factory);
+    registerFunctionExtractTextFromHTML(factory);
 #if USE_BASE64
     registerFunctionBase64Encode(factory);
     registerFunctionBase64Decode(factory);
diff --git a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.sql b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.sql
index 65c243687c1..fd1292eb3fb 100644
--- a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.sql
+++ b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.sql
@@ -1,8 +1,9 @@
-SELECT htmlOrXmlCoarseParse('<script>Here is script.</script>');
-SELECT htmlOrXmlCoarseParse('<style>Here is style.</style>');
-SELECT htmlOrXmlCoarseParse('<![CDATA[Here is CDTATA.]]>');
-SELECT htmlOrXmlCoarseParse('This is a     white   space test.');
-SELECT htmlOrXmlCoarseParse('This is a complex test. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><![CDATA[<script type="text/javascript">Hello, world</script> ]]><hello />world<![CDATA[ <style> ]]> hello</style>\n<script><![CDATA[</script>]]>hello</script>\n</html>');
+SELECT extractTextFromHTML('<script>Here is script.</script>');
+SELECT extractTextFromHTML('<style>Here is style.</style>');
+SELECT extractTextFromHTML('<![CDATA[Here is CDTATA.]]>');
+SELECT extractTextFromHTML('This is a     white   space test.');
+SELECT extractTextFromHTML('This is a complex test. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><![CDATA[<script type="text/javascript">Hello, world</script> ]]><hello />world<![CDATA[ <style> ]]> hello</style>\n<script><![CDATA[</script>]]>hello</script>\n</html>');
+
 DROP TABLE IF EXISTS defaults;
 CREATE TABLE defaults
 (
@@ -11,5 +12,5 @@ CREATE TABLE defaults
 
 INSERT INTO defaults values ('<common tag>hello, world<tag>'), ('<script desc=content> some content </script>'), ('<![CDATA[hello, world]]>'), ('white space    collapse');
 
-SELECT htmlOrXmlCoarseParse(stringColumn) FROM defaults;
+SELECT extractTextFromHTML(stringColumn) FROM defaults;
 DROP table defaults;
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.reference b/tests/queries/0_stateless/01746_extract_text_from_html.reference
new file mode 100644
index 00000000000..9f98fb08cde
--- /dev/null
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.reference
@@ -0,0 +1,106 @@
+-- { echo }
+
+SELECT extractTextFromHTML('');
+
+SELECT extractTextFromHTML(' ');
+
+SELECT extractTextFromHTML('  ');
+
+SELECT extractTextFromHTML('Hello');
+Hello
+SELECT extractTextFromHTML('Hello, world');
+Hello, world
+SELECT extractTextFromHTML('Hello,  world');
+Hello, world
+SELECT extractTextFromHTML(' Hello,  world');
+ Hello, world
+SELECT extractTextFromHTML(' Hello,  world ');
+ Hello, world
+SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
+ Hello, world
+SELECT extractTextFromHTML('Hello<world');
+Hello
+SELECT extractTextFromHTML('Hello < world');
+Hello
+SELECT extractTextFromHTML('Hello > world');
+Hello > world
+SELECT extractTextFromHTML('Hello<world>');
+Hello
+SELECT extractTextFromHTML('Hello<>world');
+Helloworld
+SELECT extractTextFromHTML('Hello<!>world');
+Helloworld
+SELECT extractTextFromHTML('Hello<!->world');
+Helloworld
+SELECT extractTextFromHTML('Hello<!-->world');
+Helloworld
+SELECT extractTextFromHTML('Hello<!--->world');
+Helloworld
+SELECT extractTextFromHTML('Hello<!---->world');
+Helloworld
+SELECT extractTextFromHTML('Hello <!-- --> World');
+Hello World
+SELECT extractTextFromHTML('Hello<!-- --> World');
+Hello World
+SELECT extractTextFromHTML('Hello<!-- -->World');
+HelloWorld
+SELECT extractTextFromHTML('Hello <!-- -->World');
+Hello World
+SELECT extractTextFromHTML('Hello <u> World</u>');
+Hello World
+SELECT extractTextFromHTML('Hello <u>World</u>');
+Hello World
+SELECT extractTextFromHTML('Hello<u>World</u>');
+HelloWorld
+SELECT extractTextFromHTML('Hello<u> World</u>');
+Hello World
+SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
+ \t Hello,\rworld \n 
+SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
+Hello Hello\tworld world!
+SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
+HelloHello\tworldworld!
+SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
+Hello Hello <b>world</b> world!
+SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
+<sender>John Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
+<sender>John <![CDATA[Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
+<sender>John <![CDATA[Smith]]>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
+<sender>John Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
+<sender>John  Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
+<sender>John Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
+<sender>John ]]>Smith</sender>
+SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
+HelloWorld goodbye
+SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
+HelloWorld goodbye
+SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
+HelloWorld goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
+Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
+Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
+Hello]]> goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
+Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
+Hello goodbye
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql
new file mode 100644
index 00000000000..9bdd153228f
--- /dev/null
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql
@@ -0,0 +1,61 @@
+-- { echo }
+
+SELECT extractTextFromHTML('');
+SELECT extractTextFromHTML(' ');
+SELECT extractTextFromHTML('  ');
+SELECT extractTextFromHTML('Hello');
+SELECT extractTextFromHTML('Hello, world');
+SELECT extractTextFromHTML('Hello,  world');
+SELECT extractTextFromHTML(' Hello,  world');
+SELECT extractTextFromHTML(' Hello,  world ');
+SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
+
+SELECT extractTextFromHTML('Hello<world');
+SELECT extractTextFromHTML('Hello < world');
+SELECT extractTextFromHTML('Hello > world');
+SELECT extractTextFromHTML('Hello<world>');
+SELECT extractTextFromHTML('Hello<>world');
+SELECT extractTextFromHTML('Hello<!>world');
+SELECT extractTextFromHTML('Hello<!->world');
+SELECT extractTextFromHTML('Hello<!-->world');
+SELECT extractTextFromHTML('Hello<!--->world');
+SELECT extractTextFromHTML('Hello<!---->world');
+
+SELECT extractTextFromHTML('Hello <!-- --> World');
+SELECT extractTextFromHTML('Hello<!-- --> World');
+SELECT extractTextFromHTML('Hello<!-- -->World');
+SELECT extractTextFromHTML('Hello <!-- -->World');
+SELECT extractTextFromHTML('Hello <u> World</u>');
+SELECT extractTextFromHTML('Hello <u>World</u>');
+SELECT extractTextFromHTML('Hello<u>World</u>');
+SELECT extractTextFromHTML('Hello<u> World</u>');
+
+SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
+SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
+SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
+
+SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
+SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
+
+SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
+SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
+SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
+SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
+SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
+SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
+
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
+
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');

From 2ac673b12a9f02a36136263abd873159e28e4de8 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sat, 27 Feb 2021 22:33:55 +0300
Subject: [PATCH 2/6] Update logic and tests

---
 src/Functions/extractTextFromHTML.cpp         | 22 +++++--
 .../01674_htm_xml_coarse_parse.reference      |  2 +-
 .../01746_extract_text_from_html.reference    | 57 +------------------
 .../01746_extract_text_from_html.sql          |  1 +
 4 files changed, 22 insertions(+), 60 deletions(-)

diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
index 5bee4dc541f..c6a9b84b33e 100644
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -61,7 +61,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
     if (!checkAndSkip(src, end, "<![CDATA["))
         return false;
 
-    if (pending_whitespace && src < end)
+    if (dst && pending_whitespace && src < end)
     {
         pending_whitespace = false;
         *dst = ' ';
@@ -77,9 +77,12 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
 
         if (gt[-1] == ']' && gt[-2] == ']')
         {
-            size_t bytes_to_copy = gt - src - strlen("]]");
-            memcpy(dst, src, bytes_to_copy);
-            dst += bytes_to_copy;
+            if (dst)
+            {
+                size_t bytes_to_copy = gt - src - strlen("]]");
+                memcpy(dst, src, bytes_to_copy);
+                dst += bytes_to_copy;
+            }
             src = gt + 1;
             break;
         }
@@ -127,6 +130,17 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en
             break;
 
         ++src;
+
+        /// Skip comments and CDATA
+        if (*src == '!')
+        {
+            --src;
+            bool pending_whitespace = false;
+            char * dst = nullptr;
+            processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
+            continue;
+        }
+
         if (*src != '/')
             continue;
         ++src;
diff --git a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
index 63b3707b9b4..72af13aedd0 100644
--- a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
+++ b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
@@ -2,7 +2,7 @@
 
 Here is CDTATA.
 This is a white space test.
-This is a complex test. <script type="text/javascript">Hello, world</script> world <style> hello
+This is a complex test. <script type="text/javascript">Hello, world</script> world <style>  hello  
 hello, world
 
 hello, world
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.reference b/tests/queries/0_stateless/01746_extract_text_from_html.reference
index 9f98fb08cde..7dfdee1cb49 100644
--- a/tests/queries/0_stateless/01746_extract_text_from_html.reference
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.reference
@@ -1,106 +1,53 @@
--- { echo }
 
-SELECT extractTextFromHTML('');
 
-SELECT extractTextFromHTML(' ');
 
-SELECT extractTextFromHTML('  ');
-
-SELECT extractTextFromHTML('Hello');
 Hello
-SELECT extractTextFromHTML('Hello, world');
 Hello, world
-SELECT extractTextFromHTML('Hello,  world');
 Hello, world
-SELECT extractTextFromHTML(' Hello,  world');
  Hello, world
-SELECT extractTextFromHTML(' Hello,  world ');
  Hello, world
-SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
  Hello, world
-SELECT extractTextFromHTML('Hello<world');
 Hello
-SELECT extractTextFromHTML('Hello < world');
 Hello
-SELECT extractTextFromHTML('Hello > world');
 Hello > world
-SELECT extractTextFromHTML('Hello<world>');
 Hello
-SELECT extractTextFromHTML('Hello<>world');
 Helloworld
-SELECT extractTextFromHTML('Hello<!>world');
 Helloworld
-SELECT extractTextFromHTML('Hello<!->world');
 Helloworld
-SELECT extractTextFromHTML('Hello<!-->world');
 Helloworld
-SELECT extractTextFromHTML('Hello<!--->world');
 Helloworld
-SELECT extractTextFromHTML('Hello<!---->world');
 Helloworld
-SELECT extractTextFromHTML('Hello <!-- --> World');
 Hello World
-SELECT extractTextFromHTML('Hello<!-- --> World');
 Hello World
-SELECT extractTextFromHTML('Hello<!-- -->World');
 HelloWorld
-SELECT extractTextFromHTML('Hello <!-- -->World');
 Hello World
-SELECT extractTextFromHTML('Hello <u> World</u>');
 Hello World
-SELECT extractTextFromHTML('Hello <u>World</u>');
 Hello World
-SELECT extractTextFromHTML('Hello<u>World</u>');
 HelloWorld
-SELECT extractTextFromHTML('Hello<u> World</u>');
 Hello World
-SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
  \t Hello,\rworld \n 
-SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
 Hello Hello\tworld world!
-SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
 HelloHello\tworldworld!
-SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
 Hello Hello <b>world</b> world!
-SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
 <sender>John Smith</sender>
-SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
 <sender>John <![CDATA[Smith</sender>
-SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
 <sender>John <![CDATA[Smith]]>
-SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
 <sender>John Smith</sender>
-SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
 <sender>John  Smith</sender>
-SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
 <sender>John Smith</sender>
-SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
 <sender>John ]]>Smith</sender>
-SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
 HelloWorld goodbye
-SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
 HelloWorld goodbye
-SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
 HelloWorld goodbye
-SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
 Hello goodbye
-SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
 Hello
-SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
 Hello
-SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
-Hello]]> goodbye
-SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
 Hello
-SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
+Hello goodbye
+Hello
 Hello goodbye
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql
index 9bdd153228f..0004849df87 100644
--- a/tests/queries/0_stateless/01746_extract_text_from_html.sql
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql
@@ -57,5 +57,6 @@ SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ st
 SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
 
 SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
 SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
 SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');

From 0ab4afeeed567b4626b45bd7c7b984c085a6916b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 28 Feb 2021 00:55:56 +0300
Subject: [PATCH 3/6] Tests and documentation

---
 .../01674_htm_xml_coarse_parse.reference      |   2 +-
 .../01746_extract_text_from_html.reference    | 107 ++++++++++++++----
 .../01746_extract_text_from_html.sql          |  10 ++
 3 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
index 72af13aedd0..9cca4934551 100644
--- a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
+++ b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference
@@ -2,7 +2,7 @@
 
 Here is CDTATA.
 This is a white space test.
-This is a complex test. <script type="text/javascript">Hello, world</script> world <style>  hello  
+This is a complex test.<script type="text/javascript">Hello, world</script> world <style> hello
 hello, world
 
 hello, world
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.reference b/tests/queries/0_stateless/01746_extract_text_from_html.reference
index 7dfdee1cb49..ee05e085ba4 100644
--- a/tests/queries/0_stateless/01746_extract_text_from_html.reference
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.reference
@@ -1,53 +1,120 @@
+-- { echo }
 
+SELECT extractTextFromHTML('');
 
+SELECT extractTextFromHTML(' ');
 
+SELECT extractTextFromHTML('  ');
+
+SELECT extractTextFromHTML('Hello');
 Hello
+SELECT extractTextFromHTML('Hello, world');
 Hello, world
+SELECT extractTextFromHTML('Hello,  world');
 Hello, world
- Hello, world
- Hello, world
- Hello, world
+SELECT extractTextFromHTML(' Hello,  world');
+Hello, world
+SELECT extractTextFromHTML(' Hello,  world ');
+Hello, world
+SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
+Hello, world
+SELECT extractTextFromHTML('Hello<world');
 Hello
+SELECT extractTextFromHTML('Hello < world');
 Hello
+SELECT extractTextFromHTML('Hello > world');
 Hello > world
+SELECT extractTextFromHTML('Hello<world>');
 Hello
-Helloworld
-Helloworld
-Helloworld
-Helloworld
-Helloworld
-Helloworld
+SELECT extractTextFromHTML('Hello<>world');
+Hello world
+SELECT extractTextFromHTML('Hello<!>world');
+Hello world
+SELECT extractTextFromHTML('Hello<!->world');
+Hello world
+SELECT extractTextFromHTML('Hello<!-->world');
+Hello world
+SELECT extractTextFromHTML('Hello<!--->world');
+Hello world
+SELECT extractTextFromHTML('Hello<!---->world');
+Hello world
+SELECT extractTextFromHTML('Hello <!-- --> World');
 Hello World
+SELECT extractTextFromHTML('Hello<!-- --> World');
 Hello World
-HelloWorld
+SELECT extractTextFromHTML('Hello<!-- -->World');
 Hello World
+SELECT extractTextFromHTML('Hello <!-- -->World');
 Hello World
+SELECT extractTextFromHTML('Hello <u> World</u>');
 Hello World
-HelloWorld
+SELECT extractTextFromHTML('Hello <u>World</u>');
 Hello World
+SELECT extractTextFromHTML('Hello<u>World</u>');
+Hello World
+SELECT extractTextFromHTML('Hello<u> World</u>');
+Hello World
+SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
  \t Hello,\rworld \n 
-Hello Hello\tworld world!
-HelloHello\tworldworld!
-Hello Hello <b>world</b> world!
+SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
+HelloHello\tworld world!
+SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
+HelloHello\tworld world!
+SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
+HelloHello <b>world</b> world!
+SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
 <sender>John Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
 <sender>John <![CDATA[Smith</sender>
-<sender>John <![CDATA[Smith]]>
+SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
+<sender>John <![CDATA[ Smith ]]>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
 <sender>John Smith</sender>
-<sender>John  Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
 <sender>John Smith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
+<sender>JohnSmith</sender>
+SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
 <sender>John ]]>Smith</sender>
+SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
 Hello goodbye
+SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
 Hello goodbye
-HelloWorld goodbye
+SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
+Hello World goodbye
+SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
 Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
 Hello goodbye
-HelloWorld goodbye
-HelloWorld goodbye
+SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
+Hello World goodbye
+SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
+Hello World goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
 Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
 Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
 Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
 Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
 Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
 Hello goodbye
-Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDAT[</style>]]> </style> goodbye');
+Hello ]]> goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![endif]--> </style> goodbye');
 Hello goodbye
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
+Hello
+SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
+Hello goodbye
+SELECT extractTextFromHTML('<![CDATA[]]]]><![CDATA[>]]>');
+]]>
+SELECT extractTextFromHTML('
+<img src="pictures/power.png" style="margin-bottom: -30px;" />
+<br><span style="padding-right: 10px; font-size: 10px;">xkcd.com</span>
+</div>
+');
+xkcd.com
diff --git a/tests/queries/0_stateless/01746_extract_text_from_html.sql b/tests/queries/0_stateless/01746_extract_text_from_html.sql
index 0004849df87..b4ccc775bef 100644
--- a/tests/queries/0_stateless/01746_extract_text_from_html.sql
+++ b/tests/queries/0_stateless/01746_extract_text_from_html.sql
@@ -58,5 +58,15 @@ SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </sty
 
 SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
 SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDAT[</style>]]> </style> goodbye');
+SELECT extractTextFromHTML('Hello<style type="text/css">World <![endif]--> </style> goodbye');
 SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
 SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
+
+SELECT extractTextFromHTML('<![CDATA[]]]]><![CDATA[>]]>');
+
+SELECT extractTextFromHTML('
+<img src="pictures/power.png" style="margin-bottom: -30px;" />
+<br><span style="padding-right: 10px; font-size: 10px;">xkcd.com</span>
+</div>
+');

From 4ab18cdcd8a5eff3f4e386a86361a60f61222e23 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 28 Feb 2021 00:59:27 +0300
Subject: [PATCH 4/6] Tests and documentation

---
 src/Functions/extractTextFromHTML.cpp | 92 +++++++++++++++++++--------
 1 file changed, 65 insertions(+), 27 deletions(-)

diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
index c6a9b84b33e..4b35eacaef0 100644
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -6,6 +6,58 @@
 #include <Common/StringUtils/StringUtils.h>
 
 
+/** A function to extract text from HTML or XHTML.
+  * It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
+  * but the implementation is reasonably accurate and it is fast.
+  *
+  * The rules are the following:
+  *
+  * 1. Comments are skipped. Example: <!-- test -->
+  * Comment must end with -->. Nested comments are not possible.
+  * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
+  *
+  * 2. CDATA is pasted verbatim.
+  * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
+  *
+  * 3. 'script' and 'style' elements are removed with all their content.
+  * Note: it's assumed that closing tag cannot appear inside content.
+  * For example, in a JS string literal it has to be escaped as "<\/script>".
+  * Note: comments and CDATA are possible inside script or style - then closing tags are not searched for inside CDATA.
+  * Example: <script><![CDATA[</script>]]></script>
+  * But they are still searched for inside comments. Sometimes it becomes complicated:
+  * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
+  * Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style elements.
+  * Example: <script:a>Hello</script:a>.
+  * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
+  *
+  * 4. Other tags or tag-like elements are skipped without inner content.
+  * Example: <a>.</a>
+  * Note: it's expected that this HTML is illegal: <a test=">"></a>
+  * Note: it will also skip something like tags: <>, <!>, etc.
+  * Note: tag without end will be skipped to the end of input: <hello
+  * >
+  * 5. HTML and XML entities are not decoded.
+  * They should be processed by a separate function.
+  *
+  * 6. Whitespaces in text are collapsed or inserted by specific rules.
+  * Whitespaces at beginning and at the end are removed.
+  * Consecutive whitespaces are collapsed.
+  * But if text is separated by other elements and there is no whitespace, it is inserted.
+  * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
+  * - in HTML there will be no whitespace, but the function will insert it.
+  * But also consider: Hello<p>world</p>, Hello<br>world.
+  * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
+  *
+  * 7. Also note that correct handling of whitespaces would require
+  * support of <pre></pre> and CSS display and white-space properties.
+  *
+  * Usage example:
+  *
+  * SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
+  *
+  * - ClickHouse has embedded web browser.
+  */
+
 namespace DB
 {
 
@@ -56,18 +108,11 @@ bool processComment(const char * __restrict & src, const char * end)
     return true;
 }
 
-bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
 {
     if (!checkAndSkip(src, end, "<![CDATA["))
         return false;
 
-    if (dst && pending_whitespace && src < end)
-    {
-        pending_whitespace = false;
-        *dst = ' ';
-        ++dst;
-    }
-
     const char * gt = src;
     while (true)
     {
@@ -131,14 +176,14 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en
 
         ++src;
 
-        /// Skip comments and CDATA
+        /// Skip CDATA
         if (*src == '!')
         {
             --src;
-            bool pending_whitespace = false;
             char * dst = nullptr;
-            processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
-            continue;
+            if (processCDATA(src, end, dst))
+                continue;
+            ++src;
         }
 
         if (*src != '/')
@@ -178,23 +223,19 @@ bool skipTag(const char * __restrict & src, const char * end)
     return false;
 }
 
-void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
+void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
 {
     while (src < end && isWhitespaceASCII(*src))
-    {
-        pending_whitespace = true;
         ++src;
-    }
 
-    if (pending_whitespace && src < end)
+    const char * lt = find_first_symbols<'<'>(src, end);
+
+    if (needs_whitespace && src < lt)
     {
-        pending_whitespace = false;
         *dst = ' ';
         ++dst;
     }
 
-    const char * lt = find_first_symbols<'<'>(src, end);
-
     while (true)
     {
         const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
@@ -204,10 +245,7 @@ void copyText(const char * __restrict & src, const char * end, char * __restrict
 
         src = ws;
         while (src < lt && isWhitespaceASCII(*src))
-        {
-            pending_whitespace = true;
             ++src;
-        }
 
         if (src < lt)
         {
@@ -232,16 +270,16 @@ size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
       * - CDATA should be copied verbatim;
       */
 
-    char * dst_begin = dst;
     const char * end = src + size;
-    bool pending_whitespace = false;
+    char * dst_begin = dst;
 
     while (src < end)
     {
-        copyText(src, end, dst, pending_whitespace);
+        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
+        copyText(src, end, dst, needs_whitespace);
 
         processComment(src, end)
-            || processCDATA(src, end, dst, pending_whitespace)
+            || processCDATA(src, end, dst)
             || processElementAndSkipContent(src, end, "script")
             || processElementAndSkipContent(src, end, "style")
             || skipTag(src, end);

From e5ae9cbb6365dcf2122672e6587a95f19ebbd187 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 28 Feb 2021 04:03:22 +0300
Subject: [PATCH 5/6] Fix Arcadia

---
 src/Functions/ya.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index 20ba5f846a3..f8beaa8540c 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -246,6 +246,7 @@ SRCS(
     extractAllGroupsHorizontal.cpp
     extractAllGroupsVertical.cpp
     extractGroups.cpp
+    extractTextFromHTML.cpp
     extractTimeZoneFromFunctionArguments.cpp
     filesystem.cpp
     finalizeAggregation.cpp
@@ -291,7 +292,6 @@ SRCS(
     hasToken.cpp
     hasTokenCaseInsensitive.cpp
     hostName.cpp
-    htmlOrXmlCoarseParse.cpp
     hypot.cpp
     identity.cpp
     if.cpp

From ae9fea1d0af118a8f87b224d194d61da1567188b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 28 Feb 2021 04:05:04 +0300
Subject: [PATCH 6/6] Fix gcc and clang-tidy

---
 src/Functions/extractTextFromHTML.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
index 4b35eacaef0..528bd0c311f 100644
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -70,12 +70,12 @@ namespace ErrorCodes
 namespace
 {
 
-ALWAYS_INLINE bool startsWith(const char * s, const char * end, const char * prefix)
+inline bool startsWith(const char * s, const char * end, const char * prefix)
 {
     return s + strlen(prefix) < end && 0 == memcmp(s, prefix, strlen(prefix));
 }
 
-ALWAYS_INLINE bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
+inline bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
 {
     if (startsWith(s, end, prefix))
     {
@@ -140,7 +140,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
 
 bool processElementAndSkipContent(const char * __restrict & src, const char * end, const char * tag_name)
 {
-    auto old_src = src;
+    const auto * old_src = src;
 
     if (!(src < end && *src == '<'))
         return false;