mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 02:21:59 +00:00
Merge pull request #21292 from ClickHouse/extract-text-from-html
Rewrite extractTextFromHTML function
This commit is contained in:
commit
467fcbec06
@ -356,7 +356,6 @@ function run_tests
|
|||||||
|
|
||||||
# JSON functions
|
# JSON functions
|
||||||
01666_blns
|
01666_blns
|
||||||
01674_htm_xml_coarse_parse
|
|
||||||
)
|
)
|
||||||
|
|
||||||
(time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
|
(time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
|
||||||
|
358
src/Functions/extractTextFromHTML.cpp
Normal file
358
src/Functions/extractTextFromHTML.cpp
Normal file
@ -0,0 +1,358 @@
|
|||||||
|
#include <Columns/ColumnString.h>
|
||||||
|
#include <Functions/FunctionFactory.h>
|
||||||
|
#include <Functions/FunctionHelpers.h>
|
||||||
|
#include <Functions/IFunctionImpl.h>
|
||||||
|
#include <common/find_symbols.h>
|
||||||
|
#include <Common/StringUtils/StringUtils.h>
|
||||||
|
|
||||||
|
|
||||||
|
/** A function to extract text from HTML or XHTML.
|
||||||
|
* It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
|
||||||
|
* but the implementation is reasonably accurate and it is fast.
|
||||||
|
*
|
||||||
|
* The rules are the following:
|
||||||
|
*
|
||||||
|
* 1. Comments are skipped. Example: <!-- test -->
|
||||||
|
* Comment must end with -->. Nested comments are not possible.
|
||||||
|
* Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
|
||||||
|
*
|
||||||
|
* 2. CDATA is pasted verbatim.
|
||||||
|
* Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
|
||||||
|
*
|
||||||
|
* 3. 'script' and 'style' elements are removed with all their content.
|
||||||
|
* Note: it's assumed that closing tag cannot appear inside content.
|
||||||
|
* For example, inside a JS string literal it has to be escaped as "<\/script>".
|
||||||
|
* Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA.
|
||||||
|
* Example: <script><![CDATA[</script>]]></script>
|
||||||
|
* But still searched inside comments. Sometimes it becomes complicated:
|
||||||
|
* <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
|
||||||
|
* Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style.
|
||||||
|
* Example: <script:a>Hello</script:a>.
|
||||||
|
* Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
|
||||||
|
*
|
||||||
|
* 4. Other tags or tag-like elements are skipped without inner content.
|
||||||
|
* Example: <a>.</a>
|
||||||
|
* Note: it's expected that this HTML is illegal: <a test=">"></a>
|
||||||
|
* Note: it will also skip something like tags: <>, <!>, etc.
|
||||||
|
* Note: tag without end will be skipped to the end of input: <hello
|
||||||
|
* >
|
||||||
|
* 5. HTML and XML entities are not decoded.
|
||||||
|
* It should be processed by separate function.
|
||||||
|
*
|
||||||
|
* 6. Whitespaces in text are collapsed or inserted by specific rules.
|
||||||
|
* Whitespaces at beginning and at the end are removed.
|
||||||
|
* Consecutive whitespaces are collapsed.
|
||||||
|
* But if text is separated by other elements and there is no whitespace, it is inserted.
|
||||||
|
* It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
|
||||||
|
* - in HTML there will be no whitespace, but the function will insert it.
|
||||||
|
* But also consider: Hello<p>world</p>, Hello<br>world.
|
||||||
|
* This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
|
||||||
|
*
|
||||||
|
* 7. Also note that correct handling of whitespaces would require
|
||||||
|
* support of <pre></pre> and CSS display and white-space properties.
|
||||||
|
*
|
||||||
|
* Usage example:
|
||||||
|
*
|
||||||
|
* SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
|
||||||
|
*
|
||||||
|
* - ClickHouse has embedded web browser.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int ILLEGAL_COLUMN;
|
||||||
|
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Checks whether the range [s, end) begins with the zero-terminated string `prefix`
/// and has at least one more byte after it.
/// Note the strict comparison: a prefix that ends exactly at `end` is not reported as a match.
inline bool startsWith(const char * s, const char * end, const char * prefix)
{
    const size_t prefix_size = strlen(prefix);

    if (s + prefix_size >= end)
        return false;

    return memcmp(s, prefix, prefix_size) == 0;
}
|
||||||
|
|
||||||
|
inline bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
|
||||||
|
{
|
||||||
|
if (startsWith(s, end, prefix))
|
||||||
|
{
|
||||||
|
s += strlen(prefix);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool processComment(const char * __restrict & src, const char * end)
|
||||||
|
{
|
||||||
|
if (!checkAndSkip(src, end, "<!--"))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char * gt = find_first_symbols<'>'>(src, end);
|
||||||
|
if (gt >= end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (gt > src + strlen("--") && gt[-1] == '-' && gt[-2] == '-')
|
||||||
|
{
|
||||||
|
src = gt + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
src = gt + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
|
||||||
|
{
|
||||||
|
if (!checkAndSkip(src, end, "<![CDATA["))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const char * gt = src;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
gt = find_first_symbols<'>'>(gt, end);
|
||||||
|
if (gt >= end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (gt[-1] == ']' && gt[-2] == ']')
|
||||||
|
{
|
||||||
|
if (dst)
|
||||||
|
{
|
||||||
|
size_t bytes_to_copy = gt - src - strlen("]]");
|
||||||
|
memcpy(dst, src, bytes_to_copy);
|
||||||
|
dst += bytes_to_copy;
|
||||||
|
}
|
||||||
|
src = gt + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
++gt;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool processElementAndSkipContent(const char * __restrict & src, const char * end, const char * tag_name)
|
||||||
|
{
|
||||||
|
const auto * old_src = src;
|
||||||
|
|
||||||
|
if (!(src < end && *src == '<'))
|
||||||
|
return false;
|
||||||
|
++src;
|
||||||
|
|
||||||
|
if (!checkAndSkip(src, end, tag_name))
|
||||||
|
{
|
||||||
|
src = old_src;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (src >= end)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!(isWhitespaceASCII(*src) || *src == '>'))
|
||||||
|
{
|
||||||
|
src = old_src;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * gt = find_first_symbols<'>'>(src, end);
|
||||||
|
if (gt >= end)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
src = gt + 1;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char * lt = find_first_symbols<'<'>(src, end);
|
||||||
|
src = lt;
|
||||||
|
if (src + 1 >= end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
++src;
|
||||||
|
|
||||||
|
/// Skip CDATA
|
||||||
|
if (*src == '!')
|
||||||
|
{
|
||||||
|
--src;
|
||||||
|
char * dst = nullptr;
|
||||||
|
if (processCDATA(src, end, dst))
|
||||||
|
continue;
|
||||||
|
++src;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*src != '/')
|
||||||
|
continue;
|
||||||
|
++src;
|
||||||
|
|
||||||
|
if (checkAndSkip(src, end, tag_name))
|
||||||
|
{
|
||||||
|
while (src < end && isWhitespaceASCII(*src))
|
||||||
|
++src;
|
||||||
|
|
||||||
|
if (src >= end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (*src == '>')
|
||||||
|
{
|
||||||
|
++src;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool skipTag(const char * __restrict & src, const char * end)
|
||||||
|
{
|
||||||
|
if (src < end && *src == '<')
|
||||||
|
{
|
||||||
|
src = find_first_symbols<'>'>(src, end);
|
||||||
|
if (src < end)
|
||||||
|
++src;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
|
||||||
|
{
|
||||||
|
while (src < end && isWhitespaceASCII(*src))
|
||||||
|
++src;
|
||||||
|
|
||||||
|
const char * lt = find_first_symbols<'<'>(src, end);
|
||||||
|
|
||||||
|
if (needs_whitespace && src < lt)
|
||||||
|
{
|
||||||
|
*dst = ' ';
|
||||||
|
++dst;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
|
||||||
|
size_t bytes_to_copy = ws - src;
|
||||||
|
memcpy(dst, src, bytes_to_copy);
|
||||||
|
dst += bytes_to_copy;
|
||||||
|
|
||||||
|
src = ws;
|
||||||
|
while (src < lt && isWhitespaceASCII(*src))
|
||||||
|
++src;
|
||||||
|
|
||||||
|
if (src < lt)
|
||||||
|
{
|
||||||
|
*dst = ' ';
|
||||||
|
++dst;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
src = lt;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
|
||||||
|
{
|
||||||
|
/** There are the following rules:
|
||||||
|
* - comments are removed with all their content;
|
||||||
|
* - elements 'script' and 'style' are removed with all their content;
|
||||||
|
* - for other elements tags are removed but content is processed as text;
|
||||||
|
* - CDATA should be copied verbatim;
|
||||||
|
*/
|
||||||
|
|
||||||
|
const char * end = src + size;
|
||||||
|
char * dst_begin = dst;
|
||||||
|
|
||||||
|
while (src < end)
|
||||||
|
{
|
||||||
|
bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
|
||||||
|
copyText(src, end, dst, needs_whitespace);
|
||||||
|
|
||||||
|
processComment(src, end)
|
||||||
|
|| processCDATA(src, end, dst)
|
||||||
|
|| processElementAndSkipContent(src, end, "script")
|
||||||
|
|| processElementAndSkipContent(src, end, "style")
|
||||||
|
|| skipTag(src, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dst - dst_begin;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class FunctionExtractTextFromHTML : public IFunction
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static constexpr auto name = "extractTextFromHTML";
|
||||||
|
|
||||||
|
static FunctionPtr create(const Context &) { return std::make_shared<FunctionExtractTextFromHTML>(); }
|
||||||
|
String getName() const override { return name; }
|
||||||
|
size_t getNumberOfArguments() const override { return 1; }
|
||||||
|
bool useDefaultImplementationForConstants() const override { return true; }
|
||||||
|
|
||||||
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||||
|
{
|
||||||
|
if (!isString(arguments[0]))
|
||||||
|
throw Exception(
|
||||||
|
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||||
|
return arguments[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
|
||||||
|
{
|
||||||
|
const ColumnString * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
|
||||||
|
if (!src)
|
||||||
|
throw Exception("First argument for function " + getName() + " must be string.", ErrorCodes::ILLEGAL_COLUMN);
|
||||||
|
|
||||||
|
const ColumnString::Chars & src_chars = src->getChars();
|
||||||
|
const ColumnString::Offsets & src_offsets = src->getOffsets();
|
||||||
|
|
||||||
|
auto res = ColumnString::create();
|
||||||
|
|
||||||
|
ColumnString::Chars & res_chars = res->getChars();
|
||||||
|
ColumnString::Offsets & res_offsets = res->getOffsets();
|
||||||
|
|
||||||
|
res_chars.resize(src_chars.size());
|
||||||
|
res_offsets.resize(src_offsets.size());
|
||||||
|
|
||||||
|
ColumnString::Offset src_offset = 0;
|
||||||
|
ColumnString::Offset res_offset = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < rows; ++i)
|
||||||
|
{
|
||||||
|
auto next_src_offset = src_offsets[i];
|
||||||
|
|
||||||
|
res_offset += extract(
|
||||||
|
reinterpret_cast<const char *>(&src_chars[src_offset]),
|
||||||
|
next_src_offset - src_offset - 1,
|
||||||
|
reinterpret_cast<char *>(&res_chars[res_offset]));
|
||||||
|
|
||||||
|
res_chars[res_offset] = 0;
|
||||||
|
++res_offset;
|
||||||
|
res_offsets[i] = res_offset;
|
||||||
|
|
||||||
|
src_offset = next_src_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
res_chars.resize(res_offset);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Registers FunctionExtractTextFromHTML in the given function factory.
void registerFunctionExtractTextFromHTML(FunctionFactory & factory)
|
||||||
|
{
|
||||||
|
factory.registerFunction<FunctionExtractTextFromHTML>();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,582 +0,0 @@
|
|||||||
#include <Columns/ColumnString.h>
|
|
||||||
#include <Functions/FunctionFactory.h>
|
|
||||||
#include <Functions/FunctionHelpers.h>
|
|
||||||
#include <Functions/IFunctionImpl.h>
|
|
||||||
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#if USE_HYPERSCAN
|
|
||||||
# include <hs.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int ILLEGAL_COLUMN;
|
|
||||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
|
||||||
extern const int CANNOT_ALLOCATE_MEMORY;
|
|
||||||
extern const int NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace
|
|
||||||
{
|
|
||||||
struct HxCoarseParseImpl
|
|
||||||
{
|
|
||||||
private:
|
|
||||||
struct SpanInfo
|
|
||||||
{
|
|
||||||
SpanInfo(): id(0), match_space(std::pair<unsigned long long, unsigned long long>(0, 0)) {} // NOLINT
|
|
||||||
SpanInfo(unsigned int matchId, std::pair<unsigned long long, unsigned long long> matchSpan): id(matchId), match_space(matchSpan){} // NOLINT
|
|
||||||
SpanInfo(const SpanInfo& obj)
|
|
||||||
{
|
|
||||||
id = obj.id;
|
|
||||||
match_space = obj.match_space;
|
|
||||||
}
|
|
||||||
SpanInfo& operator=(const SpanInfo& obj) = default;
|
|
||||||
|
|
||||||
unsigned int id;
|
|
||||||
std::pair<unsigned long long, unsigned long long> match_space; // NOLINT
|
|
||||||
};
|
|
||||||
using SpanElement = std::vector<SpanInfo>;
|
|
||||||
struct Span
|
|
||||||
{
|
|
||||||
Span(): set_script(false), set_style(false), set_semi(false), is_finding_cdata(false) {}
|
|
||||||
|
|
||||||
SpanElement copy_stack; // copy area
|
|
||||||
SpanElement tag_stack; // regexp area
|
|
||||||
SpanInfo script_ptr; // script pointer
|
|
||||||
bool set_script; // whether set script
|
|
||||||
SpanInfo style_ptr; // style pointer
|
|
||||||
bool set_style; // whether set style
|
|
||||||
SpanInfo semi_ptr; // tag ptr
|
|
||||||
bool set_semi; // whether set semi
|
|
||||||
|
|
||||||
bool is_finding_cdata;
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline void copyZone(
|
|
||||||
ColumnString::Offset& current_dst_string_offset,
|
|
||||||
ColumnString::Offset& current_copy_loc,
|
|
||||||
ColumnString::Chars& dst_chars,
|
|
||||||
const ColumnString::Chars& src_chars,
|
|
||||||
size_t bytes_to_copy,
|
|
||||||
unsigned is_space
|
|
||||||
)
|
|
||||||
{
|
|
||||||
bool is_last_space = false;
|
|
||||||
if (current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' ')
|
|
||||||
{
|
|
||||||
is_last_space = true;
|
|
||||||
}
|
|
||||||
if (bytes_to_copy == 0)
|
|
||||||
{
|
|
||||||
if (is_space && !is_last_space)
|
|
||||||
{
|
|
||||||
dst_chars[current_dst_string_offset++] = ' ';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (is_last_space && src_chars[current_copy_loc] == ' ')
|
|
||||||
{
|
|
||||||
--bytes_to_copy;
|
|
||||||
++current_copy_loc;
|
|
||||||
}
|
|
||||||
if (bytes_to_copy > 0)
|
|
||||||
{
|
|
||||||
memcpySmallAllowReadWriteOverflow15(
|
|
||||||
&dst_chars[current_dst_string_offset], &src_chars[current_copy_loc], bytes_to_copy);
|
|
||||||
current_dst_string_offset += bytes_to_copy;
|
|
||||||
}
|
|
||||||
|
|
||||||
// separator is space and last character is not space.
|
|
||||||
if (is_space && !(current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' '))
|
|
||||||
{
|
|
||||||
dst_chars[current_dst_string_offset++] = ' ';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// return;
|
|
||||||
}
|
|
||||||
static inline void popArea(SpanElement& stack, unsigned long long from, unsigned long long to) //NOLINT
|
|
||||||
{
|
|
||||||
while (!stack.empty())
|
|
||||||
{
|
|
||||||
if (to > stack.back().match_space.second && from < stack.back().match_space.second)
|
|
||||||
{
|
|
||||||
stack.pop_back();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void dealCommonTag(Span* matches)
|
|
||||||
{
|
|
||||||
while (!matches->copy_stack.empty() && matches->copy_stack.back().id != 10)
|
|
||||||
{
|
|
||||||
matches->copy_stack.pop_back();
|
|
||||||
}
|
|
||||||
if (!matches->copy_stack.empty())
|
|
||||||
{
|
|
||||||
matches->copy_stack.pop_back();
|
|
||||||
}
|
|
||||||
unsigned long long from; // NOLINT
|
|
||||||
unsigned long long to; // NOLINT
|
|
||||||
unsigned id;
|
|
||||||
for (auto begin = matches->tag_stack.begin(); begin != matches->tag_stack.end(); ++begin)
|
|
||||||
{
|
|
||||||
from = begin->match_space.first;
|
|
||||||
to = begin->match_space.second;
|
|
||||||
id = begin->id;
|
|
||||||
switch (id)
|
|
||||||
{
|
|
||||||
case 12:
|
|
||||||
case 13:
|
|
||||||
{
|
|
||||||
popArea(matches->copy_stack, from, to);
|
|
||||||
if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
|
|
||||||
matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 0:
|
|
||||||
case 2:
|
|
||||||
case 3:
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
case 6:
|
|
||||||
case 7:
|
|
||||||
case 8:
|
|
||||||
case 9:
|
|
||||||
case 10:
|
|
||||||
{
|
|
||||||
if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
|
|
||||||
{
|
|
||||||
matches->set_semi = true;
|
|
||||||
matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 1:
|
|
||||||
{
|
|
||||||
if (matches->set_semi)
|
|
||||||
{
|
|
||||||
switch (matches->semi_ptr.id)
|
|
||||||
{
|
|
||||||
case 0:
|
|
||||||
case 2:
|
|
||||||
case 3:
|
|
||||||
case 6:
|
|
||||||
case 7:
|
|
||||||
case 10:
|
|
||||||
{
|
|
||||||
if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from))
|
|
||||||
{
|
|
||||||
if (!matches->set_script)
|
|
||||||
{
|
|
||||||
matches->set_script = true;
|
|
||||||
matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from))
|
|
||||||
{
|
|
||||||
if (!matches->set_style)
|
|
||||||
{
|
|
||||||
matches->set_style = true;
|
|
||||||
matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to);
|
|
||||||
matches->copy_stack.push_back(SpanInfo(0, std::make_pair(matches->semi_ptr.match_space.first, to)));
|
|
||||||
matches->set_semi = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
case 8:
|
|
||||||
case 9:
|
|
||||||
{
|
|
||||||
SpanInfo complete_zone;
|
|
||||||
|
|
||||||
complete_zone.match_space.second = to;
|
|
||||||
if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from)))
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->script_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->script_ptr.match_space.first;
|
|
||||||
matches->set_script = false;
|
|
||||||
}
|
|
||||||
else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from)))
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->style_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->style_ptr.match_space.first;
|
|
||||||
matches->set_style = false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->semi_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->semi_ptr.match_space.first;
|
|
||||||
}
|
|
||||||
popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second);
|
|
||||||
matches->copy_stack.push_back(complete_zone);
|
|
||||||
matches->set_semi = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// return;
|
|
||||||
}
|
|
||||||
static int spanCollect(unsigned int id,
|
|
||||||
unsigned long long from, // NOLINT
|
|
||||||
unsigned long long to, // NOLINT
|
|
||||||
unsigned int , void * ctx)
|
|
||||||
{
|
|
||||||
Span* matches = static_cast<Span*>(ctx);
|
|
||||||
from = id == 12 ? from : to - patterns_length[id];
|
|
||||||
|
|
||||||
if (matches->is_finding_cdata)
|
|
||||||
{
|
|
||||||
if (id == 11)
|
|
||||||
{
|
|
||||||
matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
matches->is_finding_cdata = false;
|
|
||||||
matches->tag_stack.clear();
|
|
||||||
if (matches->semi_ptr.id == 10)
|
|
||||||
{
|
|
||||||
matches->set_semi = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (id == 12 || id == 13)
|
|
||||||
{
|
|
||||||
popArea(matches->copy_stack, from, to);
|
|
||||||
if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
|
|
||||||
matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
|
|
||||||
popArea(matches->tag_stack, from, to);
|
|
||||||
if (matches->tag_stack.empty() || from >= matches->tag_stack.back().match_space.second)
|
|
||||||
matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
popArea(matches->tag_stack, from, to);
|
|
||||||
matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
switch (id)
|
|
||||||
{
|
|
||||||
case 12:
|
|
||||||
case 13:
|
|
||||||
{
|
|
||||||
popArea(matches->copy_stack, from, to);
|
|
||||||
if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second)
|
|
||||||
matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 0:
|
|
||||||
case 2:
|
|
||||||
case 3:
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
case 6:
|
|
||||||
case 7:
|
|
||||||
case 8:
|
|
||||||
case 9:
|
|
||||||
{
|
|
||||||
if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
|
|
||||||
{
|
|
||||||
matches->set_semi = true;
|
|
||||||
matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 10:
|
|
||||||
{
|
|
||||||
if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first))
|
|
||||||
{
|
|
||||||
matches->set_semi = true;
|
|
||||||
matches->semi_ptr = SpanInfo(id, std::make_pair(from, to));
|
|
||||||
}
|
|
||||||
matches->is_finding_cdata = true;
|
|
||||||
matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to)));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 1:
|
|
||||||
{
|
|
||||||
if (matches->set_semi)
|
|
||||||
{
|
|
||||||
switch (matches->semi_ptr.id)
|
|
||||||
{
|
|
||||||
case 0:
|
|
||||||
case 2:
|
|
||||||
case 3:
|
|
||||||
case 6:
|
|
||||||
case 7:
|
|
||||||
case 10:
|
|
||||||
{
|
|
||||||
if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from))
|
|
||||||
{
|
|
||||||
if (!matches->set_script)
|
|
||||||
{
|
|
||||||
matches->set_script = true;
|
|
||||||
matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from))
|
|
||||||
{
|
|
||||||
if (!matches->set_style)
|
|
||||||
{
|
|
||||||
matches->set_style = true;
|
|
||||||
matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to);
|
|
||||||
matches->copy_stack.push_back(SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)));
|
|
||||||
matches->set_semi = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 4:
|
|
||||||
case 5:
|
|
||||||
case 8:
|
|
||||||
case 9:
|
|
||||||
{
|
|
||||||
SpanInfo complete_zone;
|
|
||||||
complete_zone.match_space.second = to;
|
|
||||||
if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from)))
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->script_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->script_ptr.match_space.first;
|
|
||||||
matches->set_script = false;
|
|
||||||
}
|
|
||||||
else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from)))
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->style_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->style_ptr.match_space.first;
|
|
||||||
matches->set_style = false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
complete_zone.id = matches->semi_ptr.id;
|
|
||||||
complete_zone.match_space.first = matches->semi_ptr.match_space.first;
|
|
||||||
}
|
|
||||||
popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second);
|
|
||||||
matches->copy_stack.push_back(complete_zone);
|
|
||||||
matches->set_semi = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#if USE_HYPERSCAN
|
|
||||||
static hs_database_t* buildDatabase(const std::vector<const char* > &expressions,
|
|
||||||
const std::vector<unsigned> &flags,
|
|
||||||
const std::vector<unsigned> &id,
|
|
||||||
unsigned int mode)
|
|
||||||
{
|
|
||||||
hs_database_t *db;
|
|
||||||
hs_compile_error_t *compile_err;
|
|
||||||
hs_error_t err;
|
|
||||||
err = hs_compile_multi(expressions.data(), flags.data(), id.data(),
|
|
||||||
expressions.size(), mode, nullptr, &db, &compile_err);
|
|
||||||
|
|
||||||
if (err != HS_SUCCESS)
|
|
||||||
{
|
|
||||||
hs_free_compile_error(compile_err);
|
|
||||||
throw Exception("Hyper scan database cannot be compiled.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
|
||||||
}
|
|
||||||
return db;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
static std::vector<const char*> patterns;
|
|
||||||
static std::vector<std::size_t> patterns_length;
|
|
||||||
static std::vector<unsigned> patterns_flag;
|
|
||||||
static std::vector<unsigned> ids;
|
|
||||||
|
|
||||||
public:
|
|
||||||
static void executeInternal(
|
|
||||||
const ColumnString::Chars & src_chars,
|
|
||||||
const ColumnString::Offsets & src_offsets,
|
|
||||||
ColumnString::Chars & dst_chars,
|
|
||||||
ColumnString::Offsets & dst_offsets)
|
|
||||||
{
|
|
||||||
#if USE_HYPERSCAN
|
|
||||||
hs_database_t * db = buildDatabase(patterns, patterns_flag, ids, HS_MODE_BLOCK);
|
|
||||||
hs_scratch_t* scratch = nullptr;
|
|
||||||
if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS)
|
|
||||||
{
|
|
||||||
hs_free_database(db);
|
|
||||||
throw Exception("Unable to allocate scratch space.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
|
||||||
}
|
|
||||||
dst_chars.resize(src_chars.size());
|
|
||||||
dst_offsets.resize(src_offsets.size());
|
|
||||||
|
|
||||||
ColumnString::Offset current_src_string_offset = 0;
|
|
||||||
ColumnString::Offset current_dst_string_offset = 0;
|
|
||||||
ColumnString::Offset current_copy_loc;
|
|
||||||
ColumnString::Offset current_copy_end;
|
|
||||||
unsigned is_space;
|
|
||||||
size_t bytes_to_copy;
|
|
||||||
Span match_zoneall;
|
|
||||||
|
|
||||||
for (size_t off = 0; off < src_offsets.size(); ++off)
|
|
||||||
{
|
|
||||||
hs_scan(db, reinterpret_cast<const char *>(&src_chars[current_src_string_offset]), src_offsets[off] - current_src_string_offset, 0, scratch, spanCollect, &match_zoneall);
|
|
||||||
if (match_zoneall.is_finding_cdata)
|
|
||||||
{
|
|
||||||
dealCommonTag(&match_zoneall);
|
|
||||||
}
|
|
||||||
SpanElement& match_zone = match_zoneall.copy_stack;
|
|
||||||
current_copy_loc = current_src_string_offset;
|
|
||||||
if (match_zone.empty())
|
|
||||||
{
|
|
||||||
current_copy_end = src_offsets[off];
|
|
||||||
is_space = 0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
current_copy_end = current_src_string_offset + match_zone.begin()->match_space.first;
|
|
||||||
is_space = (match_zone.begin()->id == 12 || match_zone.begin()->id == 13)?1:0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bytes_to_copy = current_copy_end - current_copy_loc;
|
|
||||||
copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space);
|
|
||||||
for (auto begin = match_zone.begin(); begin != match_zone.end(); ++begin)
|
|
||||||
{
|
|
||||||
current_copy_loc = current_src_string_offset + begin->match_space.second;
|
|
||||||
if (begin + 1 >= match_zone.end())
|
|
||||||
{
|
|
||||||
current_copy_end = src_offsets[off];
|
|
||||||
is_space = 0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
current_copy_end = current_src_string_offset + (begin+1)->match_space.first;
|
|
||||||
is_space = ((begin+1)->id == 12 || (begin+1)->id == 13)?1:0;
|
|
||||||
}
|
|
||||||
bytes_to_copy = current_copy_end - current_copy_loc;
|
|
||||||
copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space);
|
|
||||||
}
|
|
||||||
if (current_dst_string_offset > 1 && dst_chars[current_dst_string_offset - 2] == ' ')
|
|
||||||
{
|
|
||||||
dst_chars[current_dst_string_offset - 2] = 0;
|
|
||||||
--current_dst_string_offset;
|
|
||||||
}
|
|
||||||
dst_offsets[off] = current_dst_string_offset;
|
|
||||||
current_src_string_offset = src_offsets[off];
|
|
||||||
match_zoneall.copy_stack.clear();
|
|
||||||
match_zoneall.tag_stack.clear();
|
|
||||||
}
|
|
||||||
dst_chars.resize(dst_chars.size());
|
|
||||||
hs_free_scratch(scratch);
|
|
||||||
hs_free_database(db);
|
|
||||||
#else
|
|
||||||
(void)src_chars;
|
|
||||||
(void)src_offsets;
|
|
||||||
(void)dst_chars;
|
|
||||||
(void)dst_offsets;
|
|
||||||
throw Exception(
|
|
||||||
"htmlOrXmlCoarseParse is not implemented when hyperscan is off (is it x86 processor?)",
|
|
||||||
ErrorCodes::NOT_IMPLEMENTED);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Regular expressions scanned for in every input string.
/// The four tables below are parallel: entry N of `patterns`, `patterns_length`,
/// `patterns_flag` and `ids` all describe the same pattern.
/// NOTE(review): presumably these are compiled into the Hyperscan database released by
/// hs_free_database() in executeInternal - the compile call is not visible here, confirm.
std::vector<const char*> HxCoarseParseImpl::patterns =
{
    /*  0 */ "<[^\\s<>]",       // "<" that opens a tag, i.e. not "< ", "<<" or "<>"
    /*  1 */ ">",               // ">"
    /*  2 */ "<script\\s",      // <script xxxxx>
    /*  3 */ "<script",         // <script>
    /*  4 */ "</script\\s",     // </script xxxx>
    /*  5 */ "</script",        // </script>
    /*  6 */ "<style\\s",       // <style xxxxxx>
    /*  7 */ "<style",          // <style>
    /*  8 */ "</style\\s",      // </style xxxxx>
    /*  9 */ "</style",         // </style>
    /* 10 */ "<!\\[CDATA\\[",   // <![CDATA[xxxxxx]]>
    /* 11 */ "\\]\\]>",         // ]]>
    /* 12 */ "\\s{2,}",         // run of two or more whitespace characters
    /* 13 */ "[^\\S ]"          // a single whitespace character other than ' ' ("\n", "\t", ...)
};

/// Number of bytes matched by each pattern above.
/// Entry 12 is 0 because "\\s{2,}" has no fixed length.
std::vector<std::size_t> HxCoarseParseImpl::patterns_length =
{
    2,  //  0
    1,  //  1
    8,  //  2
    7,  //  3
    9,  //  4
    8,  //  5
    7,  //  6
    6,  //  7
    8,  //  8
    7,  //  9
    9,  // 10
    3,  // 11
    0,  // 12
    1   // 13
};

#if USE_HYPERSCAN
/// Per-pattern Hyperscan flags. Only the whitespace-run pattern (12) sets
/// HS_FLAG_SOM_LEFTMOST; every other pattern uses no flags.
std::vector<unsigned> HxCoarseParseImpl::patterns_flag =
{
    0,                      //  0
    0,                      //  1
    0,                      //  2
    0,                      //  3
    0,                      //  4
    0,                      //  5
    0,                      //  6
    0,                      //  7
    0,                      //  8
    0,                      //  9
    0,                      // 10
    0,                      // 11
    HS_FLAG_SOM_LEFTMOST,   // 12
    0                       // 13
};
#endif

/// Identifier attached to each pattern; it equals the pattern's index.
/// executeInternal treats ids 12 and 13 as whitespace matches.
std::vector<unsigned> HxCoarseParseImpl::ids =
{
    0, 1, 2, 3, 4, 5, 6,
    7, 8, 9, 10, 11, 12, 13
};
|
|
||||||
|
|
||||||
class FunctionHtmlOrXmlCoarseParse : public IFunction
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static constexpr auto name = "htmlOrXmlCoarseParse";
|
|
||||||
|
|
||||||
static FunctionPtr create(const Context &) {return std::make_shared<FunctionHtmlOrXmlCoarseParse>(); }
|
|
||||||
|
|
||||||
String getName() const override {return name;}
|
|
||||||
|
|
||||||
size_t getNumberOfArguments() const override {return 1;}
|
|
||||||
|
|
||||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
|
||||||
{
|
|
||||||
if (!isString(arguments[0]))
|
|
||||||
throw Exception(
|
|
||||||
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
||||||
return arguments[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
bool useDefaultImplementationForConstants() const override {return true;}
|
|
||||||
|
|
||||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & , size_t) const override
|
|
||||||
{
|
|
||||||
const auto & strcolumn = arguments[0].column;
|
|
||||||
if (const ColumnString* html_sentence = checkAndGetColumn<ColumnString>(strcolumn.get()))
|
|
||||||
{
|
|
||||||
auto col_res = ColumnString::create();
|
|
||||||
HxCoarseParseImpl::executeInternal(html_sentence->getChars(), html_sentence->getOffsets(), col_res->getChars(), col_res->getOffsets());
|
|
||||||
return col_res;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
throw Exception("First argument for function " + getName() + " must be string.", ErrorCodes::ILLEGAL_COLUMN);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Registers FunctionHtmlOrXmlCoarseParse with the given function factory.
void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory & factory)
{
    using Function = FunctionHtmlOrXmlCoarseParse;
    factory.registerFunction<Function>();
}
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -6,9 +6,7 @@ namespace DB
|
|||||||
{
|
{
|
||||||
|
|
||||||
class FunctionFactory;
|
class FunctionFactory;
|
||||||
#if USE_HYPERSCAN
|
|
||||||
void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory &);
|
|
||||||
#endif
|
|
||||||
void registerFunctionRepeat(FunctionFactory &);
|
void registerFunctionRepeat(FunctionFactory &);
|
||||||
void registerFunctionEmpty(FunctionFactory &);
|
void registerFunctionEmpty(FunctionFactory &);
|
||||||
void registerFunctionNotEmpty(FunctionFactory &);
|
void registerFunctionNotEmpty(FunctionFactory &);
|
||||||
@ -35,8 +33,9 @@ void registerFunctionRegexpQuoteMeta(FunctionFactory &);
|
|||||||
void registerFunctionNormalizeQuery(FunctionFactory &);
|
void registerFunctionNormalizeQuery(FunctionFactory &);
|
||||||
void registerFunctionNormalizedQueryHash(FunctionFactory &);
|
void registerFunctionNormalizedQueryHash(FunctionFactory &);
|
||||||
void registerFunctionCountMatches(FunctionFactory &);
|
void registerFunctionCountMatches(FunctionFactory &);
|
||||||
void registerFunctionEncodeXMLComponent(FunctionFactory & factory);
|
void registerFunctionEncodeXMLComponent(FunctionFactory &);
|
||||||
void registerFunctionDecodeXMLComponent(FunctionFactory & factory);
|
void registerFunctionDecodeXMLComponent(FunctionFactory &);
|
||||||
|
void registerFunctionExtractTextFromHTML(FunctionFactory &);
|
||||||
|
|
||||||
|
|
||||||
#if USE_BASE64
|
#if USE_BASE64
|
||||||
@ -47,9 +46,6 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
|
|||||||
|
|
||||||
void registerFunctionsString(FunctionFactory & factory)
|
void registerFunctionsString(FunctionFactory & factory)
|
||||||
{
|
{
|
||||||
#if USE_HYPERSCAN
|
|
||||||
registerFunctionHtmlOrXmlCoarseParse(factory);
|
|
||||||
#endif
|
|
||||||
registerFunctionRepeat(factory);
|
registerFunctionRepeat(factory);
|
||||||
registerFunctionEmpty(factory);
|
registerFunctionEmpty(factory);
|
||||||
registerFunctionNotEmpty(factory);
|
registerFunctionNotEmpty(factory);
|
||||||
@ -78,6 +74,7 @@ void registerFunctionsString(FunctionFactory & factory)
|
|||||||
registerFunctionCountMatches(factory);
|
registerFunctionCountMatches(factory);
|
||||||
registerFunctionEncodeXMLComponent(factory);
|
registerFunctionEncodeXMLComponent(factory);
|
||||||
registerFunctionDecodeXMLComponent(factory);
|
registerFunctionDecodeXMLComponent(factory);
|
||||||
|
registerFunctionExtractTextFromHTML(factory);
|
||||||
#if USE_BASE64
|
#if USE_BASE64
|
||||||
registerFunctionBase64Encode(factory);
|
registerFunctionBase64Encode(factory);
|
||||||
registerFunctionBase64Decode(factory);
|
registerFunctionBase64Decode(factory);
|
||||||
|
@ -246,6 +246,7 @@ SRCS(
|
|||||||
extractAllGroupsHorizontal.cpp
|
extractAllGroupsHorizontal.cpp
|
||||||
extractAllGroupsVertical.cpp
|
extractAllGroupsVertical.cpp
|
||||||
extractGroups.cpp
|
extractGroups.cpp
|
||||||
|
extractTextFromHTML.cpp
|
||||||
extractTimeZoneFromFunctionArguments.cpp
|
extractTimeZoneFromFunctionArguments.cpp
|
||||||
filesystem.cpp
|
filesystem.cpp
|
||||||
finalizeAggregation.cpp
|
finalizeAggregation.cpp
|
||||||
@ -291,7 +292,6 @@ SRCS(
|
|||||||
hasToken.cpp
|
hasToken.cpp
|
||||||
hasTokenCaseInsensitive.cpp
|
hasTokenCaseInsensitive.cpp
|
||||||
hostName.cpp
|
hostName.cpp
|
||||||
htmlOrXmlCoarseParse.cpp
|
|
||||||
hypot.cpp
|
hypot.cpp
|
||||||
identity.cpp
|
identity.cpp
|
||||||
if.cpp
|
if.cpp
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
SELECT htmlOrXmlCoarseParse('<script>Here is script.</script>');
|
SELECT extractTextFromHTML('<script>Here is script.</script>');
|
||||||
SELECT htmlOrXmlCoarseParse('<style>Here is style.</style>');
|
SELECT extractTextFromHTML('<style>Here is style.</style>');
|
||||||
SELECT htmlOrXmlCoarseParse('<![CDATA[Here is CDTATA.]]>');
|
SELECT extractTextFromHTML('<![CDATA[Here is CDTATA.]]>');
|
||||||
SELECT htmlOrXmlCoarseParse('This is a white space test.');
|
SELECT extractTextFromHTML('This is a white space test.');
|
||||||
SELECT htmlOrXmlCoarseParse('This is a complex test. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><![CDATA[<script type="text/javascript">Hello, world</script> ]]><hello />world<![CDATA[ <style> ]]> hello</style>\n<script><![CDATA[</script>]]>hello</script>\n</html>');
|
SELECT extractTextFromHTML('This is a complex test. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><![CDATA[<script type="text/javascript">Hello, world</script> ]]><hello />world<![CDATA[ <style> ]]> hello</style>\n<script><![CDATA[</script>]]>hello</script>\n</html>');
|
||||||
|
|
||||||
DROP TABLE IF EXISTS defaults;
|
DROP TABLE IF EXISTS defaults;
|
||||||
CREATE TABLE defaults
|
CREATE TABLE defaults
|
||||||
(
|
(
|
||||||
@ -11,5 +12,5 @@ CREATE TABLE defaults
|
|||||||
|
|
||||||
INSERT INTO defaults values ('<common tag>hello, world<tag>'), ('<script desc=content> some content </script>'), ('<![CDATA[hello, world]]>'), ('white space collapse');
|
INSERT INTO defaults values ('<common tag>hello, world<tag>'), ('<script desc=content> some content </script>'), ('<![CDATA[hello, world]]>'), ('white space collapse');
|
||||||
|
|
||||||
SELECT htmlOrXmlCoarseParse(stringColumn) FROM defaults;
|
SELECT extractTextFromHTML(stringColumn) FROM defaults;
|
||||||
DROP table defaults;
|
DROP table defaults;
|
||||||
|
120
tests/queries/0_stateless/01746_extract_text_from_html.reference
Normal file
120
tests/queries/0_stateless/01746_extract_text_from_html.reference
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
-- { echo }
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML(' ');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML(' ');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello, world');
|
||||||
|
Hello, world
|
||||||
|
SELECT extractTextFromHTML('Hello, world');
|
||||||
|
Hello, world
|
||||||
|
SELECT extractTextFromHTML(' Hello, world');
|
||||||
|
Hello, world
|
||||||
|
SELECT extractTextFromHTML(' Hello, world ');
|
||||||
|
Hello, world
|
||||||
|
SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
|
||||||
|
Hello, world
|
||||||
|
SELECT extractTextFromHTML('Hello<world');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello < world');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello > world');
|
||||||
|
Hello > world
|
||||||
|
SELECT extractTextFromHTML('Hello<world>');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello<>world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello<!>world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello<!->world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello<!-->world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello<!--->world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello<!---->world');
|
||||||
|
Hello world
|
||||||
|
SELECT extractTextFromHTML('Hello <!-- --> World');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello<!-- --> World');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello<!-- -->World');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello <!-- -->World');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello <u> World</u>');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello <u>World</u>');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello<u>World</u>');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('Hello<u> World</u>');
|
||||||
|
Hello World
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
|
||||||
|
\t Hello,\rworld \n
|
||||||
|
SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
|
||||||
|
HelloHello\tworld world!
|
||||||
|
SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
|
||||||
|
HelloHello\tworld world!
|
||||||
|
SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
|
||||||
|
HelloHello <b>world</b> world!
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
|
||||||
|
<sender>John Smith</sender>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
|
||||||
|
<sender>John <![CDATA[Smith</sender>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
|
||||||
|
<sender>John <![CDATA[ Smith ]]>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
|
||||||
|
<sender>John Smith</sender>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
|
||||||
|
<sender>John Smith</sender>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
|
||||||
|
<sender>JohnSmith</sender>
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
|
||||||
|
<sender>John ]]>Smith</sender>
|
||||||
|
SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
|
||||||
|
Hello World goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
|
||||||
|
Hello World goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
|
||||||
|
Hello World goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDAT[</style>]]> </style> goodbye');
|
||||||
|
Hello ]]> goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![endif]--> </style> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
|
||||||
|
Hello
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
|
||||||
|
Hello goodbye
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[]]]]><![CDATA[>]]>');
|
||||||
|
]]>
|
||||||
|
SELECT extractTextFromHTML('
|
||||||
|
<img src="pictures/power.png" style="margin-bottom: -30px;" />
|
||||||
|
<br><span style="padding-right: 10px; font-size: 10px;">xkcd.com</span>
|
||||||
|
</div>
|
||||||
|
');
|
||||||
|
xkcd.com
|
72
tests/queries/0_stateless/01746_extract_text_from_html.sql
Normal file
72
tests/queries/0_stateless/01746_extract_text_from_html.sql
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
-- { echo }
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('');
|
||||||
|
SELECT extractTextFromHTML(' ');
|
||||||
|
SELECT extractTextFromHTML(' ');
|
||||||
|
SELECT extractTextFromHTML('Hello');
|
||||||
|
SELECT extractTextFromHTML('Hello, world');
|
||||||
|
SELECT extractTextFromHTML('Hello, world');
|
||||||
|
SELECT extractTextFromHTML(' Hello, world');
|
||||||
|
SELECT extractTextFromHTML(' Hello, world ');
|
||||||
|
SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello<world');
|
||||||
|
SELECT extractTextFromHTML('Hello < world');
|
||||||
|
SELECT extractTextFromHTML('Hello > world');
|
||||||
|
SELECT extractTextFromHTML('Hello<world>');
|
||||||
|
SELECT extractTextFromHTML('Hello<>world');
|
||||||
|
SELECT extractTextFromHTML('Hello<!>world');
|
||||||
|
SELECT extractTextFromHTML('Hello<!->world');
|
||||||
|
SELECT extractTextFromHTML('Hello<!-->world');
|
||||||
|
SELECT extractTextFromHTML('Hello<!--->world');
|
||||||
|
SELECT extractTextFromHTML('Hello<!---->world');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello <!-- --> World');
|
||||||
|
SELECT extractTextFromHTML('Hello<!-- --> World');
|
||||||
|
SELECT extractTextFromHTML('Hello<!-- -->World');
|
||||||
|
SELECT extractTextFromHTML('Hello <!-- -->World');
|
||||||
|
SELECT extractTextFromHTML('Hello <u> World</u>');
|
||||||
|
SELECT extractTextFromHTML('Hello <u>World</u>');
|
||||||
|
SELECT extractTextFromHTML('Hello<u>World</u>');
|
||||||
|
SELECT extractTextFromHTML('Hello<u> World</u>');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
|
||||||
|
SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
|
||||||
|
SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDAT[</style>]]> </style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![endif]--> </style> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('<![CDATA[]]]]><![CDATA[>]]>');
|
||||||
|
|
||||||
|
SELECT extractTextFromHTML('
|
||||||
|
<img src="pictures/power.png" style="margin-bottom: -30px;" />
|
||||||
|
<br><span style="padding-right: 10px; font-size: 10px;">xkcd.com</span>
|
||||||
|
</div>
|
||||||
|
');
|
Loading…
Reference in New Issue
Block a user