From 1d6a01dd6fd88ef02e89fe32347c299f4b077241 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Sat, 5 Jun 2021 02:52:18 +0000 Subject: [PATCH] added tests --- tests/integration/test_nlp/__init__.py | 0 tests/integration/test_nlp/configs/config.xml | 48 ++++++++++++++++++ .../test_nlp/dictionaries/ext-en.txt | 4 ++ .../test_nlp/dictionaries/ext-ru.txt | 4 ++ tests/integration/test_nlp/test.py | 50 +++++++++++++++++++ .../0_stateless/01889_tokenize.reference | 8 +++ tests/queries/0_stateless/01889_tokenize.sql | 9 ++++ .../queries/0_stateless/01890_stem.reference | 21 ++++++++ tests/queries/0_stateless/01890_stem.sql | 23 +++++++++ 9 files changed, 167 insertions(+) create mode 100644 tests/integration/test_nlp/__init__.py create mode 100644 tests/integration/test_nlp/configs/config.xml create mode 100644 tests/integration/test_nlp/dictionaries/ext-en.txt create mode 100644 tests/integration/test_nlp/dictionaries/ext-ru.txt create mode 100644 tests/integration/test_nlp/test.py create mode 100644 tests/queries/0_stateless/01889_tokenize.reference create mode 100644 tests/queries/0_stateless/01889_tokenize.sql create mode 100644 tests/queries/0_stateless/01890_stem.reference create mode 100644 tests/queries/0_stateless/01890_stem.sql diff --git a/tests/integration/test_nlp/__init__.py b/tests/integration/test_nlp/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_nlp/configs/config.xml b/tests/integration/test_nlp/configs/config.xml new file mode 100644 index 00000000000..4932feda600 --- /dev/null +++ b/tests/integration/test_nlp/configs/config.xml @@ -0,0 +1,48 @@ + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + + + 9000 + 127.0.0.1 + + + + true + none + + AcceptCertificateHandler + + + + + 500 + 5368709120 + ./clickhouse/ + users.xml + + + + en + plain + /etc/clickhouse-server/dictionaries/ext-en.txt + + + ru + plain + 
/etc/clickhouse-server/dictionaries/ext-ru.txt + + + + + + en + /etc/clickhouse-server/dictionaries/lem-en.bin + + + diff --git a/tests/integration/test_nlp/dictionaries/ext-en.txt b/tests/integration/test_nlp/dictionaries/ext-en.txt new file mode 100644 index 00000000000..beb508e437d --- /dev/null +++ b/tests/integration/test_nlp/dictionaries/ext-en.txt @@ -0,0 +1,4 @@ +important big critical crucial essential +happy cheerful delighted ecstatic +however nonetheless but yet +quiz query check exam diff --git a/tests/integration/test_nlp/dictionaries/ext-ru.txt b/tests/integration/test_nlp/dictionaries/ext-ru.txt new file mode 100644 index 00000000000..5466354b264 --- /dev/null +++ b/tests/integration/test_nlp/dictionaries/ext-ru.txt @@ -0,0 +1,4 @@ +важный большой высокий хороший главный +веселый счастливый живой яркий смешной +хотя однако но правда +экзамен испытание проверка \ No newline at end of file diff --git a/tests/integration/test_nlp/test.py b/tests/integration/test_nlp/test.py new file mode 100644 index 00000000000..e5110722fe1 --- /dev/null +++ b/tests/integration/test_nlp/test.py @@ -0,0 +1,50 @@ +import os +import sys + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + + +cluster = ClickHouseCluster(__file__) + +instance = cluster.add_instance('instance', main_configs=['configs/config.xml']) + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', instance.docker_id) + + yield cluster + finally: + cluster.shutdown() + +def 
test_lemmatize(start_cluster): + assert instance.query("SELECT lemmatize('en', 'wolves')") == "wolf\n" + assert instance.query("SELECT lemmatize('en', 'dogs')") == "dog\n" + assert instance.query("SELECT lemmatize('en', 'looking')") == "look\n" + assert instance.query("SELECT lemmatize('en', 'took')") == "take\n" + assert instance.query("SELECT lemmatize('en', 'imported')") == "import\n" + assert instance.query("SELECT lemmatize('en', 'tokenized')") == "tokenize\n" + assert instance.query("SELECT lemmatize('en', 'flown')") == "fly\n" + +def test_synonyms_extensions(start_cluster): + assert instance.query("SELECT synonyms('en', 'crucial')") == "['important','big','critical','crucial','essential']\n" + assert instance.query("SELECT synonyms('en', 'cheerful')") == "['happy','cheerful','delighted','ecstatic']\n" + assert instance.query("SELECT synonyms('en', 'yet')") == "['however','nonetheless','but','yet']\n" + assert instance.query("SELECT synonyms('en', 'quiz')") == "['quiz','query','check','exam']\n" + + assert instance.query("SELECT synonyms('ru', 'главный')") == "['важный','большой','высокий','хороший','главный']\n" + assert instance.query("SELECT synonyms('ru', 'веселый')") == "['веселый','счастливый','живой','яркий','смешной']\n" + assert instance.query("SELECT synonyms('ru', 'правда')") == "['хотя','однако','но','правда']\n" + assert instance.query("SELECT synonyms('ru', 'экзамен')") == "['экзамен','испытание','проверка']\n" + diff --git a/tests/queries/0_stateless/01889_tokenize.reference b/tests/queries/0_stateless/01889_tokenize.reference new file mode 100644 index 00000000000..4dd6f323929 --- /dev/null +++ b/tests/queries/0_stateless/01889_tokenize.reference @@ -0,0 +1,8 @@ +['It','is','quite','a','wonderful','day','isn','t','it'] +['There','is','so','much','to','learn'] +['22','00','email','yandex','ru'] +['Токенизация','каких','либо','других','языков'] +['It','is','quite','a','wonderful','day,','isn\'t','it?'] +['There','is....','so','much','to','learn!'] +['22:00','email@yandex.ru'] 
+['Токенизация','каких-либо','других','языков?'] diff --git a/tests/queries/0_stateless/01889_tokenize.sql b/tests/queries/0_stateless/01889_tokenize.sql new file mode 100644 index 00000000000..2a5879e6c07 --- /dev/null +++ b/tests/queries/0_stateless/01889_tokenize.sql @@ -0,0 +1,9 @@ +SELECT tokenize('It is quite a wonderful day, isn\'t it?'); +SELECT tokenize('There is.... so much to learn!'); +SELECT tokenize('22:00 email@yandex.ru'); +SELECT tokenize('Токенизация каких-либо других языков?'); + +SELECT tokenizeWhitespace('It is quite a wonderful day, isn\'t it?'); +SELECT tokenizeWhitespace('There is.... so much to learn!'); +SELECT tokenizeWhitespace('22:00 email@yandex.ru'); +SELECT tokenizeWhitespace('Токенизация каких-либо других языков?'); diff --git a/tests/queries/0_stateless/01890_stem.reference b/tests/queries/0_stateless/01890_stem.reference new file mode 100644 index 00000000000..33e18cd6775 --- /dev/null +++ b/tests/queries/0_stateless/01890_stem.reference @@ -0,0 +1,21 @@ +given +combinatori +collect +possibl +studi +commonplac +pack +комбинаторн +получ +огранич +конечн +максимальн +суммарн +стоимост +remplissag +valeur +maximis +dépass +intens +étudi +peuvent diff --git a/tests/queries/0_stateless/01890_stem.sql b/tests/queries/0_stateless/01890_stem.sql new file mode 100644 index 00000000000..e0d6edd754d --- /dev/null +++ b/tests/queries/0_stateless/01890_stem.sql @@ -0,0 +1,23 @@ +SELECT stem('en', 'given'); +SELECT stem('en', 'combinatorial'); +SELECT stem('en', 'collection'); +SELECT stem('en', 'possibility'); +SELECT stem('en', 'studied'); +SELECT stem('en', 'commonplace'); +SELECT stem('en', 'packing'); + +SELECT stem('ru', 'комбинаторной'); +SELECT stem('ru', 'получила'); +SELECT stem('ru', 'ограничена'); +SELECT stem('ru', 'конечной'); +SELECT stem('ru', 'максимальной'); +SELECT stem('ru', 'суммарный'); +SELECT stem('ru', 'стоимостью'); + +SELECT stem('fr', 'remplissage'); +SELECT stem('fr', 'valeur'); +SELECT stem('fr', 'maximiser'); 
+SELECT stem('fr', 'dépasser'); +SELECT stem('fr', 'intensivement'); +SELECT stem('fr', 'étudié'); +SELECT stem('fr', 'peuvent'); \ No newline at end of file