mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 00:52:02 +00:00
Update logic and tests
This commit is contained in:
parent
fd1cf49e92
commit
2ac673b12a
@ -61,7 +61,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
|
|||||||
if (!checkAndSkip(src, end, "<![CDATA["))
|
if (!checkAndSkip(src, end, "<![CDATA["))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (pending_whitespace && src < end)
|
if (dst && pending_whitespace && src < end)
|
||||||
{
|
{
|
||||||
pending_whitespace = false;
|
pending_whitespace = false;
|
||||||
*dst = ' ';
|
*dst = ' ';
|
||||||
@ -77,9 +77,12 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
|
|||||||
|
|
||||||
if (gt[-1] == ']' && gt[-2] == ']')
|
if (gt[-1] == ']' && gt[-2] == ']')
|
||||||
{
|
{
|
||||||
size_t bytes_to_copy = gt - src - strlen("]]");
|
if (dst)
|
||||||
memcpy(dst, src, bytes_to_copy);
|
{
|
||||||
dst += bytes_to_copy;
|
size_t bytes_to_copy = gt - src - strlen("]]");
|
||||||
|
memcpy(dst, src, bytes_to_copy);
|
||||||
|
dst += bytes_to_copy;
|
||||||
|
}
|
||||||
src = gt + 1;
|
src = gt + 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -127,6 +130,17 @@ bool processElementAndSkipContent(const char * __restrict & src, const char * en
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
++src;
|
++src;
|
||||||
|
|
||||||
|
/// Skip comments and CDATA
|
||||||
|
if (*src == '!')
|
||||||
|
{
|
||||||
|
--src;
|
||||||
|
bool pending_whitespace = false;
|
||||||
|
char * dst = nullptr;
|
||||||
|
processComment(src, end) || processCDATA(src, end, dst, pending_whitespace);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (*src != '/')
|
if (*src != '/')
|
||||||
continue;
|
continue;
|
||||||
++src;
|
++src;
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
Here is CDTATA.
|
Here is CDTATA.
|
||||||
This is a white space test.
|
This is a white space test.
|
||||||
This is a complex test. <script type="text/javascript">Hello, world</script> world <style> hello
|
This is a complex test. <script type="text/javascript">Hello, world</script> world <style> hello
|
||||||
hello, world
|
hello, world
|
||||||
|
|
||||||
hello, world
|
hello, world
|
||||||
|
@ -1,106 +1,53 @@
|
|||||||
-- { echo }
|
|
||||||
|
|
||||||
SELECT extractTextFromHTML('');
|
|
||||||
|
|
||||||
SELECT extractTextFromHTML(' ');
|
|
||||||
|
|
||||||
SELECT extractTextFromHTML(' ');
|
|
||||||
|
|
||||||
SELECT extractTextFromHTML('Hello');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello, world');
|
|
||||||
Hello, world
|
Hello, world
|
||||||
SELECT extractTextFromHTML('Hello, world');
|
|
||||||
Hello, world
|
Hello, world
|
||||||
SELECT extractTextFromHTML(' Hello, world');
|
|
||||||
Hello, world
|
Hello, world
|
||||||
SELECT extractTextFromHTML(' Hello, world ');
|
|
||||||
Hello, world
|
Hello, world
|
||||||
SELECT extractTextFromHTML(' \t Hello,\rworld \n ');
|
|
||||||
Hello, world
|
Hello, world
|
||||||
SELECT extractTextFromHTML('Hello<world');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello < world');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello > world');
|
|
||||||
Hello > world
|
Hello > world
|
||||||
SELECT extractTextFromHTML('Hello<world>');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello<>world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello<!>world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello<!->world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello<!-->world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello<!--->world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello<!---->world');
|
|
||||||
Helloworld
|
Helloworld
|
||||||
SELECT extractTextFromHTML('Hello <!-- --> World');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('Hello<!-- --> World');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('Hello<!-- -->World');
|
|
||||||
HelloWorld
|
HelloWorld
|
||||||
SELECT extractTextFromHTML('Hello <!-- -->World');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('Hello <u> World</u>');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('Hello <u>World</u>');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('Hello<u>World</u>');
|
|
||||||
HelloWorld
|
HelloWorld
|
||||||
SELECT extractTextFromHTML('Hello<u> World</u>');
|
|
||||||
Hello World
|
Hello World
|
||||||
SELECT extractTextFromHTML('<![CDATA[ \t Hello,\rworld \n ]]>');
|
|
||||||
\t Hello,\rworld \n
|
\t Hello,\rworld \n
|
||||||
SELECT extractTextFromHTML('Hello <![CDATA[Hello\tworld]]> world!');
|
|
||||||
Hello Hello\tworld world!
|
Hello Hello\tworld world!
|
||||||
SELECT extractTextFromHTML('Hello<![CDATA[Hello\tworld]]>world!');
|
|
||||||
HelloHello\tworldworld!
|
HelloHello\tworldworld!
|
||||||
SELECT extractTextFromHTML('Hello <![CDATA[Hello <b>world</b>]]> world!');
|
|
||||||
Hello Hello <b>world</b> world!
|
Hello Hello <b>world</b> world!
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John Smith</sender>]]>');
|
|
||||||
<sender>John Smith</sender>
|
<sender>John Smith</sender>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[Smith</sender>]]>');
|
|
||||||
<sender>John <![CDATA[Smith</sender>
|
<sender>John <![CDATA[Smith</sender>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John <![CDATA[]]>Smith</sender>]]>');
|
|
||||||
<sender>John <![CDATA[Smith]]>
|
<sender>John <![CDATA[Smith]]>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]><![CDATA[Smith</sender>]]>');
|
|
||||||
<sender>John Smith</sender>
|
<sender>John Smith</sender>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]> <![CDATA[Smith</sender>]]>');
|
|
||||||
<sender>John Smith</sender>
|
<sender>John Smith</sender>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John]]> <![CDATA[Smith</sender>]]>');
|
|
||||||
<sender>John Smith</sender>
|
<sender>John Smith</sender>
|
||||||
SELECT extractTextFromHTML('<![CDATA[<sender>John ]]>]]><![CDATA[Smith</sender>]]>');
|
|
||||||
<sender>John ]]>Smith</sender>
|
<sender>John ]]>Smith</sender>
|
||||||
SELECT extractTextFromHTML('Hello<script>World</script> goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<script >World</script> goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<scripta>World</scripta> goodbye');
|
|
||||||
HelloWorld goodbye
|
HelloWorld goodbye
|
||||||
SELECT extractTextFromHTML('Hello<script type="text/javascript">World</script> goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World</style> goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<script:p>World</script:p> goodbye');
|
|
||||||
HelloWorld goodbye
|
HelloWorld goodbye
|
||||||
SELECT extractTextFromHTML('Hello<script:p type="text/javascript">World</script:p> goodbye');
|
|
||||||
HelloWorld goodbye
|
HelloWorld goodbye
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style> goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </style \n > goodbye');
|
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ style> goodbye');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
|
|
||||||
Hello]]> goodbye
|
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
|
|
||||||
Hello
|
Hello
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
|
Hello goodbye
|
||||||
|
Hello
|
||||||
Hello goodbye
|
Hello goodbye
|
||||||
|
@ -57,5 +57,6 @@ SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </ st
|
|||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <!-- abc --> </stylea> goodbye');
|
||||||
|
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </stylea> goodbye');
|
||||||
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <![CDATA[</style>]]> </style> goodbye');
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </stylea> goodbye');
|
||||||
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
|
SELECT extractTextFromHTML('Hello<style type="text/css">World <script>abc</script> </style> goodbye');
|
||||||
|
Loading…
Reference in New Issue
Block a user