mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Tests and documentation
This commit is contained in:
parent
0ab4afeeed
commit
4ab18cdcd8
@ -6,6 +6,58 @@
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
|
||||
|
||||
/** A function to extract text from HTML or XHTML.
|
||||
* It does not necessarily 100% conform to any of the HTML, XML or XHTML standards,
|
||||
* but the implementation is reasonably accurate and it is fast.
|
||||
*
|
||||
* The rules are the following:
|
||||
*
|
||||
* 1. Comments are skipped. Example: <!-- test -->
|
||||
* Comment must end with -->. Nested comments are not possible.
|
||||
* Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
|
||||
*
|
||||
* 2. CDATA is pasted verbatim.
|
||||
* Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
|
||||
*
|
||||
* 3. 'script' and 'style' elements are removed with all their content.
|
||||
* Note: it's assumed that closing tag cannot appear inside content.
|
||||
* For example, in a JS string literal it has to be escaped as "<\/script>".
|
||||
* Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA.
|
||||
* Example: <script><![CDATA[</script>]]></script>
|
||||
* But still searched inside comments. Sometimes it becomes complicated:
|
||||
* <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
|
||||
* Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style.
|
||||
* Example: <script:a>Hello</script:a>.
|
||||
* Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
|
||||
*
|
||||
* 4. Other tags or tag-like elements are skipped without inner content.
|
||||
* Example: <a>.</a>
|
||||
* Note: it's expected that this HTML is illegal: <a test=">"></a>
|
||||
* Note: it will also skip something like tags: <>, <!>, etc.
|
||||
* Note: tag without end will be skipped to the end of input: <hello
|
||||
* >
|
||||
* 5. HTML and XML entities are not decoded.
|
||||
* They should be processed by a separate function.
|
||||
*
|
||||
* 6. Whitespaces in text are collapsed or inserted by specific rules.
|
||||
* Whitespaces at beginning and at the end are removed.
|
||||
* Consecutive whitespaces are collapsed.
|
||||
* But if text is separated by other elements and there is no whitespace, it is inserted.
|
||||
* It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
|
||||
* - in HTML there will be no whitespace, but the function will insert it.
|
||||
* But also consider: Hello<p>world</p>, Hello<br>world.
|
||||
* This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
|
||||
*
|
||||
* 7. Also note that correct handling of whitespaces would require
|
||||
* support of <pre></pre> and CSS display and white-space properties.
|
||||
*
|
||||
* Usage example:
|
||||
*
|
||||
* SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
|
||||
*
|
||||
* - ClickHouse has an embedded web browser.
|
||||
*/
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
@ -56,18 +108,11 @@ bool processComment(const char * __restrict & src, const char * end)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
|
||||
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
|
||||
{
|
||||
if (!checkAndSkip(src, end, "<![CDATA["))
|
||||
return false;
|
||||
|
||||
if (dst && pending_whitespace && src < end)
|
||||
{
|
||||
pending_whitespace = false;
|
||||
*dst = ' ';
|
||||
++dst;
|
||||
}
|
||||
|
||||
const char * gt = src;
|
||||
while (true)
|
||||
{
|
||||
@ -131,14 +176,14 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en
|
||||
|
||||
++src;
|
||||
|
||||
/// Skip comments and CDATA
|
||||
/// Skip CDATA
|
||||
if (*src == '!')
|
||||
{
|
||||
--src;
|
||||
bool pending_whitespace = false;
|
||||
char * dst = nullptr;
|
||||
processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
|
||||
continue;
|
||||
if (processCDATA(src, end, dst))
|
||||
continue;
|
||||
++src;
|
||||
}
|
||||
|
||||
if (*src != '/')
|
||||
@ -178,23 +223,19 @@ bool skipTag(const char * __restrict & src, const char * end)
|
||||
return false;
|
||||
}
|
||||
|
||||
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
|
||||
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
|
||||
{
|
||||
while (src < end && isWhitespaceASCII(*src))
|
||||
{
|
||||
pending_whitespace = true;
|
||||
++src;
|
||||
}
|
||||
|
||||
if (pending_whitespace && src < end)
|
||||
const char * lt = find_first_symbols<'<'>(src, end);
|
||||
|
||||
if (needs_whitespace && src < lt)
|
||||
{
|
||||
pending_whitespace = false;
|
||||
*dst = ' ';
|
||||
++dst;
|
||||
}
|
||||
|
||||
const char * lt = find_first_symbols<'<'>(src, end);
|
||||
|
||||
while (true)
|
||||
{
|
||||
const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
|
||||
@ -204,10 +245,7 @@ void copyText(const char * __restrict & src, const char * end, char * __restrict
|
||||
|
||||
src = ws;
|
||||
while (src < lt && isWhitespaceASCII(*src))
|
||||
{
|
||||
pending_whitespace = true;
|
||||
++src;
|
||||
}
|
||||
|
||||
if (src < lt)
|
||||
{
|
||||
@ -232,16 +270,16 @@ size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
|
||||
* - CDATA should be copied verbatim;
|
||||
*/
|
||||
|
||||
char * dst_begin = dst;
|
||||
const char * end = src + size;
|
||||
bool pending_whitespace = false;
|
||||
char * dst_begin = dst;
|
||||
|
||||
while (src < end)
|
||||
{
|
||||
copyText(src, end, dst, pending_whitespace);
|
||||
bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
|
||||
copyText(src, end, dst, needs_whitespace);
|
||||
|
||||
processComment(src, end)
|
||||
|| processCDATA(src, end, dst, pending_whitespace)
|
||||
|| processCDATA(src, end, dst)
|
||||
|| processElementAndSkipContent(src, end, "script")
|
||||
|| processElementAndSkipContent(src, end, "style")
|
||||
|| skipTag(src, end);
|
||||
|
Loading…
Reference in New Issue
Block a user