ClickHouse/src/Functions/extractTextFromHTML.cpp

#include <Columns/ColumnString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <base/find_symbols.h>
#include <Common/StringUtils/StringUtils.h>


/** A function to extract text from HTML or XHTML.
  * It does not necessarily 100% conforms to any of the HTML, XML or XHTML standards,
  * but the implementation is reasonably accurate and it is fast.
  *
  * The rules are the following:
  *
  * 1. Comments are skipped. Example: <!-- test -->
  * Comment must end with -->. Nested comments are not possible.
  * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
  *
  * 2. CDATA is pasted verbatim.
  * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
  *
  * 3. 'script' and 'style' elements are removed with all their content.
  * Note: it's assumed that closing tag cannot appear inside content.
  * For example, in JS string literal is has to be escaped as "<\/script>".
  * Note: comments and CDATA is possible inside script or style - then closing tags are not searched inside CDATA.
  * Example: <script><![CDATA[</script>]]></script>
  * But still searched inside comments. Sometimes it becomes complicated:
  * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
  * Note: script and style can be the names of XML namespaces - then they are not treat like usual script or style.
  * Example: <script:a>Hello</script:a>.
  * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
  *
  * 4. Other tags or tag-like elements are skipped without inner content.
  * Example: <a>.</a>
  * Note: it's expected that this HTML is illegal: <a test=">"></a>
  * Note: it will also skip something like tags: <>, <!>, etc.
  * Note: tag without end will be skipped to the end of input: <hello
  * >
  * 5. HTML and XML entities are not decoded.
  * It should be processed by separate function.
  *
  * 6. Whitespaces in text are collapsed or inserted by specific rules.
  * Whitespaces at beginning and at the end are removed.
  * Consecutive whitespaces are collapsed.
  * But if text is separated by other elements and there is no whitespace, it is inserted.
  * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
  * - in HTML there will be no whitespace, but the function will insert it.
  * But also consider: Hello<p>world</p>, Hello<br>world.
  * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
  *
  * 7. Also note that correct handling of whitespaces would require
  * support of <pre></pre> and CSS display and white-space properties.
  *
  * Usage example:
  *
  * SELECT extractTextFromHTML(html) FROM url('https://github.com/ClickHouse/ClickHouse', RawBLOB, 'html String')
  *
  * - ClickHouse has embedded web browser.
  */

namespace DB
{

namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}

namespace
{

inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
{
    return s + prefix.length() < end && 0 == memcmp(s, prefix.data(), prefix.length());
}

inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
{
    if (startsWith(s, end, prefix))
    {
        s += prefix.length();
        return true;
    }
    return false;
}

bool processComment(const char * __restrict & src, const char * end)
{
    if (!checkAndSkip(src, end, "<!--"))
        return false;

    while (true)
    {
        const char * gt = find_first_symbols<'>'>(src, end);
        if (gt >= end)
            break;

        if (gt > src + strlen("--") && gt[-1] == '-' && gt[-2] == '-')
        {
            src = gt + 1;
            break;
        }

        src = gt + 1;
    }

    return true;
}

bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
{
    if (!checkAndSkip(src, end, "<![CDATA["))
        return false;

    const char * gt = src;
    while (true)
    {
        gt = find_first_symbols<'>'>(gt, end);
        if (gt >= end)
            break;

        if (gt[-1] == ']' && gt[-2] == ']')
        {
            if (dst)
            {
                size_t bytes_to_copy = gt - src - strlen("]]");
                memcpy(dst, src, bytes_to_copy);
                dst += bytes_to_copy;
            }
            src = gt + 1;
            break;
        }

        ++gt;
    }

    return true;
}

bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
{
    const auto * old_src = src;

    if (!(src < end && *src == '<'))
        return false;
    ++src;

    if (!checkAndSkip(src, end, tag_name))
    {
        src = old_src;
        return false;
    }

    if (src >= end)
        return false;

    if (!(isWhitespaceASCII(*src) || *src == '>'))
    {
        src = old_src;
        return false;
    }

    const char * gt = find_first_symbols<'>'>(src, end);
    if (gt >= end)
        return false;

    src = gt + 1;

    while (true)
    {
        const char * lt = find_first_symbols<'<'>(src, end);
        src = lt;
        if (src + 1 >= end)
            break;

        ++src;

        /// Skip CDATA
        if (*src == '!')
        {
            --src;
            char * dst = nullptr;
            if (processCDATA(src, end, dst))
                continue;
            ++src;
        }

        if (*src != '/')
            continue;
        ++src;

        if (checkAndSkip(src, end, tag_name))
        {
            while (src < end && isWhitespaceASCII(*src))
                ++src;

            if (src >= end)
                break;

            if (*src == '>')
            {
                ++src;
                break;
            }
        }
    }

    return true;
}

bool skipTag(const char * __restrict & src, const char * end)
{
    if (src < end && *src == '<')
    {
        src = find_first_symbols<'>'>(src, end);
        if (src < end)
            ++src;

        return true;
    }

    return false;
}

void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
{
    while (src < end && isWhitespaceASCII(*src))
        ++src;

    const char * lt = find_first_symbols<'<'>(src, end);

    if (needs_whitespace && src < lt)
    {
        *dst = ' ';
        ++dst;
    }

    while (true)
    {
        const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
        size_t bytes_to_copy = ws - src;
        memcpy(dst, src, bytes_to_copy);
        dst += bytes_to_copy;

        src = ws;
        while (src < lt && isWhitespaceASCII(*src))
            ++src;

        if (src < lt)
        {
            *dst = ' ';
            ++dst;
        }
        else
        {
            break;
        }
    }

    src = lt;
}

size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
{
    /** There are the following rules:
      * - comments are removed with all their content;
      * - elements 'script' and 'style' are removed with all their content;
      * - for other elements tags are removed but content is processed as text;
      * - CDATA should be copied verbatim;
      */

    const char * end = src + size;
    char * dst_begin = dst;

    while (src < end)
    {
        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
        copyText(src, end, dst, needs_whitespace);

        processComment(src, end)
            || processCDATA(src, end, dst)
            || processElementAndSkipContent(src, end, "script")
            || processElementAndSkipContent(src, end, "style")
            || skipTag(src, end);
    }

    return dst - dst_begin;
}

}


class FunctionExtractTextFromHTML : public IFunction
{
public:
    static constexpr auto name = "extractTextFromHTML";

    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractTextFromHTML>(); }
    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 1; }
    bool useDefaultImplementationForConstants() const override { return true; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isString(arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
                arguments[0]->getName(), getName());
        return arguments[0];
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
    {
        const ColumnString * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
        if (!src)
             throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be string.", getName());

        const ColumnString::Chars & src_chars = src->getChars();
        const ColumnString::Offsets & src_offsets = src->getOffsets();

        auto res = ColumnString::create();

        ColumnString::Chars & res_chars = res->getChars();
        ColumnString::Offsets & res_offsets = res->getOffsets();

        res_chars.resize(src_chars.size());
        res_offsets.resize(src_offsets.size());

        ColumnString::Offset src_offset = 0;
        ColumnString::Offset res_offset = 0;

        for (size_t i = 0; i < rows; ++i)
        {
            auto next_src_offset = src_offsets[i];

            res_offset += extract(
                reinterpret_cast<const char *>(&src_chars[src_offset]),
                next_src_offset - src_offset - 1,
                reinterpret_cast<char *>(&res_chars[res_offset]));

            res_chars[res_offset] = 0;
            ++res_offset;
            res_offsets[i] = res_offset;

            src_offset = next_src_offset;
        }

        res_chars.resize(res_offset);
        return res;
    }
};

REGISTER_FUNCTION(ExtractTextFromHTML)
{
    factory.registerFunction<FunctionExtractTextFromHTML>();
}

}