Tests and documentation

This commit is contained in:
Alexey Milovidov 2021-02-28 00:59:27 +03:00
parent 0ab4afeeed
commit 4ab18cdcd8

View File

@ -6,6 +6,58 @@
#include <Common/StringUtils/StringUtils.h>
/** A function to extract text from HTML or XHTML.
* It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
* but the implementation is reasonably accurate and it is fast.
*
* The rules are the following:
*
* 1. Comments are skipped. Example: <!-- test -->
* Comment must end with -->. Nested comments are not possible.
* Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
*
* 2. CDATA is pasted verbatim.
* Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
*
* 3. 'script' and 'style' elements are removed with all their content.
* Note: it's assumed that closing tag cannot appear inside content.
* For example, in a JS string literal it has to be escaped as "<\/script>".
* Note: comments and CDATA are possible inside script or style - then closing tags are not searched for inside CDATA.
* Example: <script><![CDATA[</script>]]></script>
* But closing tags are still searched for inside comments. Sometimes it becomes complicated:
* <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
* Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style elements.
* Example: <script:a>Hello</script:a>.
* Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
*
* 4. Other tags or tag-like elements are skipped without inner content.
* Example: <a>.</a>
* Note: it's expected that this HTML is illegal: <a test=">"></a>
* Note: it will also skip something like tags: <>, <!>, etc.
* Note: tag without end will be skipped to the end of input: <hello
* >
* 5. HTML and XML entities are not decoded.
* It should be processed by separate function.
*
* 6. Whitespaces in text are collapsed or inserted by specific rules.
* Whitespaces at beginning and at the end are removed.
* Consecutive whitespaces are collapsed.
* But if text is separated by other elements and there is no whitespace, it is inserted.
* It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
* - in HTML there will be no whitespace, but the function will insert it.
* But also consider: Hello<p>world</p>, Hello<br>world.
* This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
*
* 7. Also note that correct handling of whitespaces would require
* support of <pre></pre> and CSS display and white-space properties.
*
* Usage example:
*
* SELECT extractTextFromHTML(html) FROM url('https://yandex.ru/', RawBLOB, 'html String')
*
* - ClickHouse has an embedded web browser.
*/
namespace DB
{
@ -56,18 +108,11 @@ bool processComment(const char * __restrict & src, const char * end)
return true;
}
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
{
if (!checkAndSkip(src, end, "<![CDATA["))
return false;
if (dst && pending_whitespace && src < end)
{
pending_whitespace = false;
*dst = ' ';
++dst;
}
const char * gt = src;
while (true)
{
@ -131,14 +176,14 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en
++src;
/// Skip comments and CDATA
/// Skip CDATA
if (*src == '!')
{
--src;
bool pending_whitespace = false;
char * dst = nullptr;
processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
continue;
if (processCDATA(src, end, dst))
continue;
++src;
}
if (*src != '/')
@ -178,23 +223,19 @@ bool skipTag(const char * __restrict & src, const char * end)
return false;
}
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool & pending_whitespace)
void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
{
while (src < end && isWhitespaceASCII(*src))
{
pending_whitespace = true;
++src;
}
if (pending_whitespace && src < end)
const char * lt = find_first_symbols<'<'>(src, end);
if (needs_whitespace && src < lt)
{
pending_whitespace = false;
*dst = ' ';
++dst;
}
const char * lt = find_first_symbols<'<'>(src, end);
while (true)
{
const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
@ -204,10 +245,7 @@ void copyText(const char * __restrict & src, const char * end, char * __restrict
src = ws;
while (src < lt && isWhitespaceASCII(*src))
{
pending_whitespace = true;
++src;
}
if (src < lt)
{
@ -232,16 +270,16 @@ size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
* - CDATA should be copied verbatim;
*/
char * dst_begin = dst;
const char * end = src + size;
bool pending_whitespace = false;
char * dst_begin = dst;
while (src < end)
{
copyText(src, end, dst, pending_whitespace);
bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
copyText(src, end, dst, needs_whitespace);
processComment(src, end)
|| processCDATA(src, end, dst, pending_whitespace)
|| processCDATA(src, end, dst)
|| processElementAndSkipContent(src, end, "script")
|| processElementAndSkipContent(src, end, "style")
|| skipTag(src, end);