Merge branch 'master' into MDB-15474

This commit is contained in:
ianton-ru 2021-12-30 12:55:26 +03:00 committed by GitHub
commit 92cb451d0f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
156 changed files with 5558 additions and 1044 deletions

View File

@ -1,14 +1,16 @@
option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES})
option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY
"Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)"
ON)
if (ENABLE_AZURE_BLOB_STORAGE)
set(USE_AZURE_BLOB_STORAGE 1)
set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk)
else()
return()
endif()
option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY
"Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)"
ON)
if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk"
OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules")
AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)

View File

@ -31,6 +31,7 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE)
if (CCACHE_VERSION VERSION_GREATER "3.2.0" OR NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "Using ${CCACHE_FOUND} ${CCACHE_VERSION}")
set(LAUNCHER ${CCACHE_FOUND})
# debian (debhelpers) set SOURCE_DATE_EPOCH environment variable, that is
# filled from the debian/changelog or current time.
@ -39,13 +40,8 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE)
# of the manifest, which do not allow to use previous cache,
# - 4.2+ ccache ignores SOURCE_DATE_EPOCH for every file w/o __DATE__/__TIME__
#
# So for:
# - 4.2+ does not require any sloppiness
# - 4.0+ will ignore SOURCE_DATE_EPOCH environment variable.
if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.2")
message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required")
set(LAUNCHER ${CCACHE_FOUND})
elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0")
# Exclude SOURCE_DATE_EPOCH env for ccache versions between [4.0, 4.2).
if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0" AND CCACHE_VERSION VERSION_LESS "4.2")
message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache")
set(LAUNCHER env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND})
endif()

View File

@ -186,7 +186,7 @@ $ echo "SELECT 1" | gzip -c | \
```
``` bash
# Receiving compressed data from the server
# Receiving compressed data archive from the server
$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \
-H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3'
$ zcat result.gz
@ -195,6 +195,15 @@ $ zcat result.gz
2
```
```bash
# Receiving compressed data from the server and using the gunzip to receive decompressed data
$ curl -sS "http://localhost:8123/?enable_http_compression=1" \
-H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 3' | gunzip -
0
1
2
```
## Default Database {#default-database}
You can use the database URL parameter or the X-ClickHouse-Database header to specify the default database.

View File

@ -60,8 +60,10 @@ toc_title: Adopters
| <a href="https://www.exness.com/" class="favicon">Exness</a> | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) |
| <a href="https://www.eventbunker.io/" class="favicon">EventBunker.io</a> | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) |
| <a href="https://fastnetmon.com/" class="favicon">FastNetMon</a> | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) |
| <a href="https://www.firebolt.io/" class="favicon">Firebolt</a> | Analytics | Main product | - | - | [YouTube Tech Talk](https://www.youtube.com/watch?v=9rW9uEJ15tU) |
| <a href="https://www.flipkart.com/" class="favicon">Flipkart</a> | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) |
| <a href="https://fun.co/rp" class="favicon">FunCorp</a> | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) |
| <a href="https://futurragroup.com/" class="favicon">Futurra Group</a> | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) |
| <a href="https://geniee.co.jp" class="favicon">Geniee</a> | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) |
| <a href="https://www.genotek.ru/" class="favicon">Genotek</a> | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) |
| <a href="https://gigapipe.com/" class="favicon">Gigapipe</a> | Managed ClickHouse | Main product | — | — | [Official website](https://gigapipe.com/) |
@ -70,6 +72,7 @@ toc_title: Adopters
| <a href="https://www.grouparoo.com" class="favicon">Grouparoo</a> | Data Warehouse Integrations | Main product | — | — | [Official Website, November 2021](https://www.grouparoo.com/integrations) |
| <a href="https://www.huya.com/" class="favicon">HUYA</a> | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) |
| <a href="https://www.hydrolix.io/" class="favicon">Hydrolix</a> | Cloud data platform | Main product | — | — | [Documentation](https://docs.hydrolix.io/guide/query) |
| <a href="https://hystax.com" class="favicon">Hystax</a> | Cloud Operations | Observability Analytics | - | - | [Blog](https://hystax.com/clickhouse-for-real-time-cost-saving-analytics-how-to-stop-hammering-screws-and-use-an-electric-screwdriver/) |
| <a href="https://www.the-ica.com/" class="favicon">ICA</a> | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) |
| <a href="https://www.idealista.com" class="favicon">Idealista</a> | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.com/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) |
| <a href="https://infobaleen.com" class="favicon">Infobaleen</a> | AI marketing tool | Analytics | — | — | [Official site](https://infobaleen.com) |
@ -81,14 +84,18 @@ toc_title: Adopters
| <a href="https://ippon.tech" class="favicon">Ippon Technologies</a> | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) |
| <a href="https://www.ivi.ru/" class="favicon">Ivi</a> | Online Cinema | Analytics, Monitoring | — | — | [Article in Russian, Jan 2018](https://habr.com/en/company/ivi/blog/347408/) |
| <a href="https://jinshuju.net" class="favicon">Jinshuju 金数据</a> | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) |
| <a href="https://jitsu.com" class="favicon">Jitsu</a> | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News](https://news.ycombinator.com/item?id=29106082) |
| <a href="https://jitsu.com" class="favicon">Jitsu</a> | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News post](https://news.ycombinator.com/item?id=29106082) |
| <a href="https://juicefs.com/" class="favicon">JuiceFS</a> | Storage | Shopping Cart | - | - | [Blog](https://juicefs.com/blog/en/posts/shopee-clickhouse-with-juicefs/) |
| <a href="https://www.kakaocorp.com/" class="favicon">kakaocorp</a> | Internet company | — | — | — | [if(kakao)2020](https://tv.kakao.com/channel/3693125/cliplink/414129353), [if(kakao)2021](https://if.kakao.com/session/24) |
| <a href="https://www.kodiakdata.com/" class="favicon">Kodiak Data</a> | Clouds | Main product | — | — | [Slides in English, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) |
| <a href="https://kontur.ru" class="favicon">Kontur</a> | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) |
| <a href="https://www.kuaishou.com/" class="favicon">Kuaishou</a> | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) |
| <a href="https://www.kgk-global.com/en/" class="favicon">KGK Global</a> | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) |
| <a href="https://www.lancom-systems.com/" class="favicon">LANCOM Systems</a> | Network Solutions | Traffic analysis | - | - | [ClickHouse Operator for Kubernetes](https://www.lancom-systems.com/), [Hacker News post](https://news.ycombinator.com/item?id=29413660) |
| <a href="https://www.lbl.gov" class="favicon">Lawrence Berkeley National Laboratory</a> | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) |
| <a href="https://www.lever.co/" class="favicon">Lever</a> | Talent Management | Recruiting | - | - | [Hacker News post](https://news.ycombinator.com/item?id=29558544) |
| <a href="https://lifestreet.com/" class="favicon">LifeStreet</a> | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) |
| <a href="https://lookforsale.ru/" class="favicon">Lookforsale</a> | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) |
| <a href="https://mcs.mail.ru/" class="favicon">Mail.ru Cloud Solutions</a> | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) |
| <a href="https://maxilect.com/" class="favicon">MAXILECT</a> | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) |
| <a href="https://tech.mymarilyn.ru" class="favicon">Marilyn</a> | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) |
@ -106,6 +113,7 @@ toc_title: Adopters
| <a href="https://ok.ru" class="favicon">Ok.ru</a> | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) |
| <a href="https://omnicomm.ru/" class="favicon">Omnicomm</a> | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) |
| <a href="https://www.oneapm.com/" class="favicon">OneAPM</a> | Monitoring and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) |
| <a href="https://opensee.io/" class="favicon">Opensee</a> | Financial Analytics | Main product | - | - | [Blog](https://opensee.io/news/from-moscow-to-wall-street-the-remarkable-journey-of-clickhouse/) |
| <a href="https://www.opentargets.org/" class="favicon">Open Targets</a> | Genome Research | Genome Search | — | — | [Tweet, October 2021](https://twitter.com/OpenTargets/status/1452570865342758913?s=20), [Blog](https://blog.opentargets.org/graphql/) |
| <a href="https://corp.ozon.com/" class="favicon">OZON</a> | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) |
| <a href="https://panelbear.com/" class="favicon">Panelbear</a> | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) |
@ -118,6 +126,7 @@ toc_title: Adopters
| <a href="https://prana-system.com/en/" class="favicon">PRANA</a> | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) |
| <a href="https://www.qingcloud.com/" class="favicon">QINGCLOUD</a> | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) |
| <a href="https://qrator.net" class="favicon">Qrator</a> | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) |
| <a href="https://rvision.pro/en/" class="favicon">R-Vision</a> | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) |
| <a href="https://www.rbinternational.com/" class="favicon">Raiffeisenbank</a> | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) |
| <a href="https://rambler.ru" class="favicon">Rambler</a> | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) |
| <a href="https://replicahq.com" class="favicon">Replica</a> | Urban Planning | Analytics | — | — | [Job advertisement](https://boards.greenhouse.io/replica/jobs/5547732002?gh_jid=5547732002) |
@ -153,6 +162,7 @@ toc_title: Adopters
| <a href="https://www.tinybird.co/" class="favicon">Tinybird</a> | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) |
| <a href="https://trafficstars.com/" class="favicon">Traffic Stars</a> | AD network | — | 300 servers in Europe/US | 1.8 PiB, 700 000 insert rps (as of 2021) | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) |
| <a href="https://www.uber.com" class="favicon">Uber</a> | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/uber.pdf) |
| <a href="https://usetech.com/" class="favicon">UseTech</a> | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) |
| <a href="https://hello.utmstat.com/" class="favicon">UTMSTAT</a> | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) |
| <a href="https://vercel.com/" class="favicon">Vercel</a> | Traffic and Performance Analytics | — | — | — | Direct reference, October 2021 |
| <a href="https://vk.com" class="favicon">VKontakte</a> | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) |
@ -168,7 +178,8 @@ toc_title: Adopters
| <a href="https://cloud.yandex.ru/services/managed-clickhouse" class="favicon">Yandex Cloud</a> | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) |
| <a href="https://cloud.yandex.ru/services/datalens" class="favicon">Yandex DataLens</a> | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/datalens.pdf) |
| <a href="https://market.yandex.ru/" class="favicon">Yandex Market</a> | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) |
| <a href="https://metrica.yandex.com" class="favicon">Yandex Metrica</a> | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) |
| <a href="https://metrica.yandex.com" class="favicon">Yandex Metrica</a> | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) |
| <a href="https://www.yellowfinbi.com" class="favicon"><COMPANYNAME></a> | Analytics | Main product | - | - | [Integration](https://www.yellowfinbi.com/campaign/yellowfin-9-whats-new#el-30219e0e) |
| <a href="https://www.yotascale.com/" class="favicon">Yotascale</a> | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) |
| <a href="https://www.your-analytics.org/" class="favicon">Your Analytics</a> | Product Analytics | Main Product | — | - | [Tweet, November 2021](https://twitter.com/mikenikles/status/1459737241165565953) |
| <a href="https://zagravagames.com/en/" class="favicon">Zagrava Trading</a> | — | — | — | — | [Job offer, May 2021](https://twitter.com/datastackjobs/status/1394707267082063874) |
@ -178,9 +189,5 @@ toc_title: Adopters
| <a href="https://promo.croc.ru/digitalworker" class="favicon">Цифровой Рабочий</a> | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| <a href="https://shop.okraina.ru/" class="favicon">ООО «МПЗ Богородский»</a> | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
| <a href="https://domclick.ru/" class="favicon">ДомКлик</a> | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
| <a href="https://futurragroup.com/" class="favicon">Futurra Group</a> | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) |
| <a href="https://usetech.com/" class="favicon">UseTech</a> | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) |
| <a href="https://lookforsale.ru/" class="favicon">Lookforsale</a> | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) |
| <a href="https://rvision.pro/en/" class="favicon">R-Vision</a> | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/) <!--hide-->

View File

@ -9,11 +9,12 @@ The following operations with [projections](../../../engines/table-engines/merge
- `ALTER TABLE [db].name ADD PROJECTION name ( SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata.
- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk.
- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
- `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description.
- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files.

View File

@ -3,10 +3,10 @@ toc_priority: 67
toc_title: NLP
---
# [экспериментально] Функции для работы с ествественным языком {#nlp-functions}
# [экспериментально] Функции для работы с естественным языком {#nlp-functions}
!!! warning "Предупреждение"
Сейчас использование функций для работы с ествественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`.
Сейчас использование функций для работы с естественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`.
## stem {#stem}
@ -84,7 +84,7 @@ SELECT lemmatize('en', 'wolves');
Находит синонимы к заданному слову. Представлены два типа расширений словарей: `plain` и `wordnet`.
Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соотвествует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции.
Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соответствует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции.
Для работы расширения типа `plain` необходимо указать путь до WordNet тезауруса. Тезаурус должен содержать WordNet sense index.

View File

@ -342,6 +342,9 @@ private:
}
}
/// Now we don't block the Ctrl+C signal and second signal will terminate the program without waiting.
interrupt_listener.unblock();
pool.wait();
total_watch.stop();
@ -586,7 +589,6 @@ public:
#ifndef __clang__
#pragma GCC optimize("-fno-var-tracking-assignments")
#endif
#pragma GCC diagnostic ignored "-Wmissing-declarations"
int mainEntryClickHouseBenchmark(int argc, char ** argv)
{

View File

@ -313,11 +313,11 @@ void LocalServer::cleanup()
std::string LocalServer::getInitialCreateTableQuery()
{
if (!config().has("table-structure"))
if (!config().has("table-structure") && !config().has("table-file"))
return {};
auto table_name = backQuoteIfNeed(config().getString("table-name", "table"));
auto table_structure = config().getString("table-structure");
auto table_structure = config().getString("table-structure", "auto");
auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV"));
String table_file;
@ -332,7 +332,12 @@ std::string LocalServer::getInitialCreateTableQuery()
table_file = quoteString(config().getString("table-file"));
}
return fmt::format("CREATE TABLE {} ({}) ENGINE = File({}, {});",
if (table_structure == "auto")
table_structure = "";
else
table_structure = "(" + table_structure + ")";
return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});",
table_name, table_structure, data_format, table_file);
}
@ -422,7 +427,7 @@ try
#else
is_interactive = stdin_is_a_tty
&& (config().hasOption("interactive")
|| (!config().has("query") && !config().has("table-structure") && queries_files.empty()));
|| (!config().has("query") && !config().has("table-structure") && queries_files.empty() && !config().has("table-file")));
#endif
if (!is_interactive)
{

View File

@ -526,6 +526,14 @@ if (USE_BZIP2)
target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR})
endif()
if(USE_SIMDJSON)
dbms_target_link_libraries(PRIVATE simdjson)
endif()
if(USE_RAPIDJSON)
dbms_target_include_directories(SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR})
endif()
dbms_target_link_libraries(PUBLIC consistent-hashing)
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")

View File

@ -604,6 +604,7 @@
M(633, QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW) \
M(634, MONGODB_ERROR) \
M(635, CANNOT_POLL) \
M(636, CANNOT_EXTRACT_TABLE_STRUCTURE) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -86,7 +86,7 @@ namespace
/// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object
void updateResources(std::string_view name, const void * address, SymbolIndex::Resources & resources)
void updateResources(ElfW(Addr) base_address, std::string_view object_name, std::string_view name, const void * address, SymbolIndex::Resources & resources)
{
const char * char_address = static_cast<const char *>(address);
@ -97,18 +97,23 @@ void updateResources(std::string_view name, const void * address, SymbolIndex::R
name = name.substr((name[0] == '_') + strlen("binary_"));
name = name.substr(0, name.size() - strlen("_start"));
resources.emplace(name, std::string_view{char_address, 0}); // NOLINT
resources.emplace(name, SymbolIndex::ResourcesBlob{
base_address,
object_name,
std::string_view{char_address, 0}, // NOLINT
});
}
else if (name.ends_with("_end"))
{
name = name.substr((name[0] == '_') + strlen("binary_"));
name = name.substr(0, name.size() - strlen("_end"));
if (auto it = resources.find(name); it != resources.end() && it->second.empty())
auto it = resources.find(name);
if (it != resources.end() && it->second.base_address == base_address && it->second.data.empty())
{
const char * start = it->second.data();
const char * start = it->second.data.data();
assert(char_address >= start);
it->second = std::string_view{start, static_cast<size_t>(char_address - start)};
it->second.data = std::string_view{start, static_cast<size_t>(char_address - start)};
}
}
}
@ -153,10 +158,12 @@ void collectSymbolsFromProgramHeaders(
size_t sym_cnt = 0;
for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it)
{
ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr);
// TODO: this branch leads to invalid address of the hash table. Need further investigation.
// if (it->d_tag == DT_HASH)
// {
// const ElfW(Word) * hash = reinterpret_cast<const ElfW(Word) *>(correct_address(info->dlpi_addr, it->d_un.d_ptr));
// const ElfW(Word) * hash = reinterpret_cast<const ElfW(Word) *>(base_address);
// sym_cnt = hash[1];
// break;
// }
@ -167,7 +174,7 @@ void collectSymbolsFromProgramHeaders(
const uint32_t * buckets = nullptr;
const uint32_t * hashval = nullptr;
const ElfW(Word) * hash = reinterpret_cast<const ElfW(Word) *>(correct_address(info->dlpi_addr, it->d_un.d_ptr));
const ElfW(Word) * hash = reinterpret_cast<const ElfW(Word) *>(base_address);
buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4);
@ -196,9 +203,11 @@ void collectSymbolsFromProgramHeaders(
const char * strtab = nullptr;
for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it)
{
ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr);
if (it->d_tag == DT_STRTAB)
{
strtab = reinterpret_cast<const char *>(correct_address(info->dlpi_addr, it->d_un.d_ptr));
strtab = reinterpret_cast<const char *>(base_address);
break;
}
}
@ -208,10 +217,12 @@ void collectSymbolsFromProgramHeaders(
for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it)
{
ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr);
if (it->d_tag == DT_SYMTAB)
{
/* Get the pointer to the first entry of the symbol table */
const ElfW(Sym) * elf_sym = reinterpret_cast<const ElfW(Sym) *>(correct_address(info->dlpi_addr, it->d_un.d_ptr));
const ElfW(Sym) * elf_sym = reinterpret_cast<const ElfW(Sym) *>(base_address);
/* Iterate over the symbol table */
for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index)
@ -236,7 +247,7 @@ void collectSymbolsFromProgramHeaders(
symbols.push_back(symbol);
/// But resources can be represented by a pair of empty symbols (indicating their boundaries).
updateResources(symbol.name, symbol.address_begin, resources);
updateResources(base_address, info->dlpi_name, symbol.name, symbol.address_begin, resources);
}
break;
@ -299,7 +310,7 @@ void collectSymbolsFromELFSymbolTable(
if (symbol_table_entry->st_size)
symbols.push_back(symbol);
updateResources(symbol.name, symbol.address_begin, resources);
updateResources(info->dlpi_addr, info->dlpi_name, symbol.name, symbol.address_begin, resources);
}
}

View File

@ -51,7 +51,7 @@ public:
std::string_view getResource(String name) const
{
if (auto it = data.resources.find(name); it != data.resources.end())
return it->second;
return it->second.data;
return {};
}
@ -59,7 +59,17 @@ public:
String getBuildID() const { return data.build_id; }
String getBuildIDHex() const;
using Resources = std::unordered_map<std::string_view /* symbol name */, std::string_view /* blob */>;
struct ResourcesBlob
{
/// Symbol can be presented in multiple shared objects,
/// base_address will be used to compare only symbols from the same SO.
ElfW(Addr) base_address;
/// Just a human name of the SO.
std::string_view object_name;
/// Data blob.
std::string_view data;
};
using Resources = std::unordered_map<std::string_view /* symbol name */, ResourcesBlob>;
struct Data
{

View File

@ -26,6 +26,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int BAD_ARGUMENTS;
}
}
@ -1133,4 +1134,54 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version)
return request;
}
/// Canonicalize a ZooKeeper path: strip a single trailing '/' and ensure the
/// path starts with '/', because a configured chroot prefix is concatenated
/// without a separator.
/// @param zookeeper_path path to normalize (taken by value; modified copy returned)
/// @param check_starts_with_slash if true, a path not starting with '/' is an error
///        (used for newly created tables); if false, the path is fixed up with a warning
/// @param log optional logger for the legacy-path warning; may be nullptr
/// @throws DB::Exception(BAD_ARGUMENTS) when check_starts_with_slash is set and the path lacks a leading '/'
std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log)
{
    if (!zookeeper_path.empty() && zookeeper_path.back() == '/')
        zookeeper_path.resize(zookeeper_path.size() - 1);

    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
    if (!zookeeper_path.empty() && zookeeper_path.front() != '/')
    {
        /// Do not allow this for new tables, print warning for tables created in old versions.
        if (check_starts_with_slash)
            throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path);
        if (log)
            /// Bug fix: the format string has a '{}' placeholder, but the argument was missing.
            LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases", zookeeper_path);
        zookeeper_path = "/" + zookeeper_path;
    }

    return zookeeper_path;
}
/// Returns the name of the ZooKeeper cluster encoded in `path`.
/// A path of the form "<name>:/..." designates the auxiliary ZooKeeper `<name>`;
/// a path starting with '/' designates the default ZooKeeper.
/// @throws DB::Exception(BAD_ARGUMENTS) on an empty path or an empty cluster name prefix
String extractZooKeeperName(const String & path)
{
    static constexpr auto default_zookeeper_name = "default";

    if (path.empty())
        throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS);

    if (path.front() == '/')
        return default_zookeeper_name;

    /// A name prefix only counts if the ":/" appears before the first '/'.
    auto prefix_end = path.find(":/");
    bool has_name_prefix = prefix_end != String::npos && prefix_end < path.find('/');
    if (!has_name_prefix)
        return default_zookeeper_name;

    String zookeeper_name = path.substr(0, prefix_end);
    if (zookeeper_name.empty())
        throw DB::Exception("Zookeeper path should start with '/' or '<auxiliary_zookeeper_name>:/'", DB::ErrorCodes::BAD_ARGUMENTS);
    return zookeeper_name;
}
/// Strips an optional "<auxiliary_zookeeper_name>:" prefix from `path` and
/// normalizes the remainder (see normalizeZooKeeperPath).
/// @throws DB::Exception(BAD_ARGUMENTS) on an empty path
String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log)
{
    if (path.empty())
        throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS);

    if (path.front() != '/')
    {
        /// The prefix is only honored when ":/" occurs before the first '/'.
        auto prefix_end = path.find(":/");
        if (prefix_end != String::npos && prefix_end < path.find('/'))
            return normalizeZooKeeperPath(path.substr(prefix_end + 1, String::npos), check_starts_with_slash, log);
    }

    return normalizeZooKeeperPath(path, check_starts_with_slash, log);
}
}

View File

@ -379,4 +379,11 @@ private:
};
using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;
String normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr);
String extractZooKeeperName(const String & path);
String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr);
}

View File

@ -596,6 +596,8 @@ class IColumn;
M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \
@ -661,6 +663,7 @@ class IColumn;
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
\
M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0)\
// End of FORMAT_FACTORY_SETTINGS
// Please add settings non-related to formats into the COMMON_SETTINGS above.

View File

@ -377,6 +377,8 @@ struct WhichDataType
constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
constexpr bool isFunction() const { return idx == TypeIndex::Function; }
constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; }
constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; }
};
/// IDataType helpers (alternative for IDataType virtual methods with single point of truth)

View File

@ -76,10 +76,16 @@ std::pair<String, StoragePtr> createTableFromAST(
/// - the database has not been loaded yet;
/// - the code is simpler, since the query is already brought to a suitable form.
if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns)
throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);
columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true);
constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints);
{
if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(ast_create_query.storage->engine->name))
throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);
/// Leave columns empty.
}
else
{
columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true);
constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints);
}
}
return

View File

@ -7,6 +7,8 @@
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/IDataType.h>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/join.hpp>
@ -26,6 +28,7 @@ namespace ErrorCodes
extern const int FILE_DOESNT_EXIST;
extern const int UNKNOWN_EXCEPTION;
extern const int INCORRECT_DATA;
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
}
capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info)
@ -427,6 +430,113 @@ void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Blo
}
}
/// Build a ClickHouse Enum data type whose (name, value) pairs are the
/// Cap'n Proto enumerant names and their ordinals.
template <typename ValueType>
static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants)
{
    std::vector<std::pair<String, ValueType>> enum_values;
    enum_values.reserve(enumerants.size());
    for (auto enumerant : enumerants)
        enum_values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal()));
    return std::make_shared<DataTypeEnum<ValueType>>(std::move(enum_values));
}
/// Choose the narrowest ClickHouse Enum type (Enum8 or Enum16) that can hold
/// all enumerant ordinals of the given Cap'n Proto enum schema.
/// Throws CANNOT_EXTRACT_TABLE_STRUCTURE when there are too many enumerants.
static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema)
{
    auto enumerants = enum_schema.getEnumerants();
    if (enumerants.size() < 128)
        return getEnumDataTypeFromEnumerants<Int8>(enumerants);
    if (enumerants.size() < 32768)
        return getEnumDataTypeFromEnumerants<Int16>(enumerants);

    /// Fixed typo in the error message: "16-but" -> "16-bit".
    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "ClickHouse supports only 8 and 16-bit Enums");
}
/// Convert a Cap'n Proto type to the corresponding ClickHouse data type.
/// Throws CANNOT_EXTRACT_TABLE_STRUCTURE for types/shapes that cannot be
/// represented (general unions, structs/lists inside Nullable, and any
/// type not handled by a case below, e.g. Void/Interface/AnyPointer).
static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type)
{
    switch (capnp_type.which())
    {
        case capnp::schema::Type::INT8:
            return std::make_shared<DataTypeInt8>();
        case capnp::schema::Type::INT16:
            return std::make_shared<DataTypeInt16>();
        case capnp::schema::Type::INT32:
            return std::make_shared<DataTypeInt32>();
        case capnp::schema::Type::INT64:
            return std::make_shared<DataTypeInt64>();
        /// BOOL is mapped to UInt8 (ClickHouse has no separate Bool column type here).
        case capnp::schema::Type::BOOL: [[fallthrough]];
        case capnp::schema::Type::UINT8:
            return std::make_shared<DataTypeUInt8>();
        case capnp::schema::Type::UINT16:
            return std::make_shared<DataTypeUInt16>();
        case capnp::schema::Type::UINT32:
            return std::make_shared<DataTypeUInt32>();
        case capnp::schema::Type::UINT64:
            return std::make_shared<DataTypeUInt64>();
        case capnp::schema::Type::FLOAT32:
            return std::make_shared<DataTypeFloat32>();
        case capnp::schema::Type::FLOAT64:
            return std::make_shared<DataTypeFloat64>();
        /// Both DATA (bytes) and TEXT map to String.
        case capnp::schema::Type::DATA: [[fallthrough]];
        case capnp::schema::Type::TEXT:
            return std::make_shared<DataTypeString>();
        case capnp::schema::Type::ENUM:
            return getEnumDataTypeFromEnumSchema(capnp_type.asEnum());
        case capnp::schema::Type::LIST:
        {
            /// List<T> -> Array(convert(T)), recursively.
            auto list_schema = capnp_type.asList();
            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType());
            return std::make_shared<DataTypeArray>(nested_type);
        }
        case capnp::schema::Type::STRUCT:
        {
            auto struct_schema = capnp_type.asStruct();

            /// Check if it can be Nullable: a named union of exactly two fields,
            /// one of which is Void, encodes "value or null".
            if (checkIfStructIsNamedUnion(struct_schema))
            {
                auto fields = struct_schema.getUnionFields();
                if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid()))
                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unions are not supported");
                /// The non-Void side carries the value.
                auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType();
                /// ClickHouse does not allow Nullable(Tuple)/Nullable(Array).
                if (value_type.isStruct() || value_type.isList())
                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Tuples and Lists cannot be inside Nullable");

                auto nested_type = getDataTypeFromCapnProtoType(value_type);
                return std::make_shared<DataTypeNullable>(nested_type);
            }

            if (checkIfStructContainsUnnamedUnion(struct_schema))
                throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported");

            /// Treat Struct as Tuple: one named element per non-union field.
            DataTypes nested_types;
            Names nested_names;
            for (auto field : struct_schema.getNonUnionFields())
            {
                nested_names.push_back(field.getProto().getName());
                nested_types.push_back(getDataTypeFromCapnProtoType(field.getType()));
            }
            return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names));
        }
        default:
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type));
    }
}
/// Convert the root Cap'n Proto struct schema into a ClickHouse table schema:
/// one column per non-union field. Top-level unnamed unions are rejected.
NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema)
{
    if (checkIfStructContainsUnnamedUnion(schema))
        throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported");

    NamesAndTypesList result;
    for (auto field : schema.getNonUnionFields())
        result.emplace_back(field.getProto().getName(), getDataTypeFromCapnProtoType(field.getType()));
    return result;
}
}
#endif

View File

@ -38,6 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re
void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode);
NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema);
}
#endif

View File

@ -1,7 +1,16 @@
#include <Formats/EscapingRuleUtils.h>
#include <Formats/JSONEachRowUtils.h>
#include <Formats/ReadSchemaUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/JSON/Parser.h>
#include <Parsers/TokenIterator.h>
#include <Parsers/ExpressionListParsers.h>
#include <Interpreters/evaluateConstantExpression.h>
namespace DB
{
@ -9,6 +18,7 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule)
@ -193,30 +203,145 @@ void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSe
}
}
String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
/// Read one raw value from `buf` according to the escaping rule.
/// With read_string == true the value is fully unescaped/unquoted;
/// with read_string == false the raw field text is kept (for type inference).
///
/// NOTE(review): this block appears to contain diff-rendering residue — in
/// several cases a plain read call is immediately followed by an equivalent
/// `if constexpr` branch, and `default:` has two consecutive throws. Verify
/// against version control; the duplicated lines are kept verbatim here.
template <bool read_string>
String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    String result;
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Quoted:
            readQuotedString(result, buf); /// NOTE(review): looks like leftover of the pre-template version — confirm.
            if constexpr (read_string)
                readQuotedString(result, buf);
            else
                readQuotedFieldIntoString(result, buf);
            break;
        case FormatSettings::EscapingRule::JSON:
            readJSONString(result, buf); /// NOTE(review): same apparent duplication — confirm.
            if constexpr (read_string)
                readJSONString(result, buf);
            else
                readJSONFieldIntoString(result, buf);
            break;
        case FormatSettings::EscapingRule::Raw:
            readString(result, buf);
            break;
        case FormatSettings::EscapingRule::CSV:
            readCSVString(result, buf, format_settings.csv); /// NOTE(review): same apparent duplication — confirm.
            if constexpr (read_string)
                readCSVString(result, buf, format_settings.csv);
            else
                readCSVField(result, buf, format_settings.csv);
            break;
        case FormatSettings::EscapingRule::Escaped:
            readEscapedString(result, buf);
            break;
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read string with {} escaping rule", escapingRuleToString(escaping_rule));
            /// NOTE(review): unreachable second throw — likely the added line of the diff.
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));
    }
    return result;
}
/// Read one raw (not unescaped) field — used for schema inference.
String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    String field = readByEscapingRule</*read_string=*/false>(buf, escaping_rule, format_settings);
    return field;
}
/// Read one fully unescaped string value.
String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    String value = readByEscapingRule</*read_string=*/true>(buf, escaping_rule, format_settings);
    return value;
}
static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context)
{
if (!context)
throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression");
ParserExpression parser;
Expected expected;
Tokens tokens(field.data, field.data + field.size);
IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
ASTPtr ast;
/// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats.
bool parsed = parser.parse(token_iterator, ast, expected);
if (!parsed)
return false;
try
{
std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
type = generalizeDataType(result.second);
return true;
}
catch (...)
{
return false;
}
}
/// Try to determine a ClickHouse type for a single raw field written with the
/// given escaping rule. Returns nullptr when the type cannot be inferred
/// (e.g. NULL representation, unparsable Quoted expression).
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
{
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Quoted:
        {
            /// Interpret the field as a constant expression and take its type.
            DataTypePtr type;
            bool parsed = evaluateConstantExpressionFromString(field, type, context);
            return parsed ? type : nullptr;
        }
        case FormatSettings::EscapingRule::JSON:
            return getDataTypeFromJSONField(field);
        case FormatSettings::EscapingRule::CSV:
        {
            if (field.empty() || field == format_settings.csv.null_representation)
                return nullptr;

            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
                return std::make_shared<DataTypeUInt8>();

            DataTypePtr type;
            bool parsed;
            /// Require size > 1 before stripping quotes: for a field that is a
            /// single quote character, field.size() - 2 would wrap around
            /// (size_t underflow) and produce an out-of-bounds StringRef.
            if (field.size() > 1 && (field[0] == '\'' || field[0] == '"'))
            {
                /// Try to evaluate expression inside quotes.
                parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context);
                /// If it's a number in quotes we determine it as a string.
                if (parsed && type && isNumber(removeNullable(type)))
                    return makeNullable(std::make_shared<DataTypeString>());
            }
            else
                parsed = evaluateConstantExpressionFromString(field, type, context);

            /// If we couldn't parse an expression, determine it as a string.
            return parsed ? type : makeNullable(std::make_shared<DataTypeString>());
        }
        case FormatSettings::EscapingRule::Raw: [[fallthrough]];
        case FormatSettings::EscapingRule::Escaped:
            /// TODO: Try to use some heuristics here to determine the type of data.
            return field.empty() ? nullptr : makeNullable(std::make_shared<DataTypeString>());
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule));
    }
}
/// Infer a type for every field in the row; entries may be nullptr when a
/// field's type could not be determined.
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
{
    DataTypes types;
    types.reserve(fields.size());
    for (size_t i = 0; i < fields.size(); ++i)
        types.push_back(determineDataTypeByEscapingRule(fields[i], format_settings, escaping_rule, context));
    return types;
}
/// Fallback type for escaping rules whose fields carry no type information:
/// CSV/Escaped/Raw default to Nullable(String); all others have no default.
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule)
{
    const bool has_string_default = escaping_rule == FormatSettings::EscapingRule::CSV
        || escaping_rule == FormatSettings::EscapingRule::Escaped
        || escaping_rule == FormatSettings::EscapingRule::Raw;

    if (has_string_default)
        return makeNullable(std::make_shared<DataTypeString>());
    return nullptr;
}
}

View File

@ -4,6 +4,7 @@
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <IO/ReadBuffer.h>
#include <Interpreters/Context.h>
namespace DB
{
@ -33,5 +34,24 @@ void serializeFieldByEscapingRule(
void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
/// Try to determine the type of the field written by a specific escaping rule.
/// If cannot, return nullptr.
/// - For Quoted escaping rule we can interpret a single field as a constant
/// expression and get its type by evaluating this expression.
/// - For JSON escaping rule we can use JSON parser to parse a single field
/// and then convert JSON type of this field to ClickHouse type.
/// - For CSV escaping rule we can do the next:
/// - If the field is an unquoted string, then we could try to evaluate it
/// as a constant expression, and if it fails, treat it as a String.
/// - If the field is a string in quotes, then we can try to evaluate
/// expression inside quotes as a constant expression, and if it fails or
/// the result is a number (we don't parse numbers in quotes) we treat it as a String.
/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here)
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
}

View File

@ -14,9 +14,6 @@
#include <Poco/URI.h>
#include <Common/Exception.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadHelpers.h>
namespace DB
{
@ -120,6 +117,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.seekable_read = settings.input_format_allow_seeks;
format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (format_settings.schema.is_server)
@ -201,7 +200,6 @@ InputFormatPtr FormatFactory::getInput(
return format;
}
InputFormatPtr FormatFactory::getInputFormat(
const String & name,
ReadBuffer & buf,
@ -342,6 +340,32 @@ String FormatFactory::getContentType(
return format->getContentType();
}
/// Create a schema reader for the given input format.
/// Throws LOGICAL_ERROR when the format has no registered schema reader.
SchemaReaderPtr FormatFactory::getSchemaReader(
    const String & name,
    ReadBuffer & buf,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    /// Use getCreators() (as the other accessors do) so that an unknown format
    /// name raises a proper error instead of std::out_of_range from dict.at().
    const auto & schema_reader_creator = getCreators(name).schema_reader_creator;
    if (!schema_reader_creator)
        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    return schema_reader_creator(buf, format_settings, context);
}
/// Create an external schema reader (schema comes from an external source,
/// e.g. a format schema file, not from the data itself).
/// Throws LOGICAL_ERROR when the format has no registered external schema reader.
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
    const String & name,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    /// Use getCreators() (as the other accessors do) so that an unknown format
    /// name raises a proper error instead of std::out_of_range from dict.at().
    const auto & external_schema_reader_creator = getCreators(name).external_schema_reader_creator;
    if (!external_schema_reader_creator)
        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    return external_schema_reader_creator(format_settings);
}
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
{
@ -375,6 +399,21 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm
target = std::move(file_segmentation_engine);
}
/// Register the schema-reader creator for a format; at most one per format.
void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator)
{
    auto & creator_slot = dict[name].schema_reader_creator;
    if (creator_slot)
        throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
    creator_slot = std::move(schema_reader_creator);
}
/// Register the external-schema-reader creator for a format; at most one per format.
void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator)
{
    auto & creator_slot = dict[name].external_schema_reader_creator;
    if (creator_slot)
        throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
    creator_slot = std::move(external_schema_reader_creator);
}
void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
{
@ -412,6 +451,23 @@ bool FormatFactory::isOutputFormat(const String & name) const
return it != dict.end() && it->second.output_creator;
}
/// Whether the format can infer a schema from the data itself.
bool FormatFactory::checkIfFormatHasSchemaReader(const String & name)
{
    return static_cast<bool>(getCreators(name).schema_reader_creator);
}
/// Whether the format can take its schema from an external source.
bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name)
{
    return static_cast<bool>(getCreators(name).external_schema_reader_creator);
}
/// Whether the format supports schema inference of either kind.
bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name)
{
    if (checkIfFormatHasSchemaReader(name))
        return true;
    return checkIfFormatHasExternalSchemaReader(name);
}
FormatFactory & FormatFactory::instance()
{
static FormatFactory ret;

View File

@ -4,7 +4,9 @@
#include <Columns/IColumn.h>
#include <Formats/FormatSettings.h>
#include <Interpreters/Context_fwd.h>
#include <IO/BufferWithOwnMemory.h>
#include <base/types.h>
#include <Core/NamesAndTypes.h>
#include <boost/noncopyable.hpp>
@ -31,6 +33,11 @@ class IOutputFormat;
struct RowInputFormatParams;
struct RowOutputFormatParams;
class ISchemaReader;
class IExternalSchemaReader;
using SchemaReaderPtr = std::shared_ptr<ISchemaReader>;
using ExternalSchemaReaderPtr = std::shared_ptr<IExternalSchemaReader>;
using InputFormatPtr = std::shared_ptr<IInputFormat>;
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
@ -85,11 +92,16 @@ private:
/// The checker should return true if parallel parsing should be disabled.
using NonTrivialPrefixAndSuffixChecker = std::function<bool(ReadBuffer & buf)>;
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings, ContextPtr context)>;
using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>;
struct Creators
{
InputCreator input_creator;
OutputCreator output_creator;
FileSegmentationEngine file_segmentation_engine;
SchemaReaderCreator schema_reader_creator;
ExternalSchemaReaderCreator external_schema_reader_creator;
bool supports_parallel_formatting{false};
bool is_column_oriented{false};
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
@ -138,6 +150,17 @@ public:
ContextPtr context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
SchemaReaderPtr getSchemaReader(
const String & name,
ReadBuffer & buf,
ContextPtr context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
ExternalSchemaReaderPtr getExternalSchemaReader(
const String & name,
ContextPtr context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine);
void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker);
@ -146,11 +169,19 @@ public:
void registerInputFormat(const String & name, InputCreator input_creator);
void registerOutputFormat(const String & name, OutputCreator output_creator);
/// Register schema readers for format its name.
void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);
void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator);
void markOutputFormatSupportsParallelFormatting(const String & name);
void markFormatAsColumnOriented(const String & name);
bool checkIfFormatIsColumnOriented(const String & name);
bool checkIfFormatHasSchemaReader(const String & name);
bool checkIfFormatHasExternalSchemaReader(const String & name);
bool checkIfFormatHasAnySchemaReader(const String & name);
const FormatsDictionary & getAllFormats() const
{
return dict;
@ -163,6 +194,7 @@ private:
FormatsDictionary dict;
const Creators & getCreators(const String & name) const;
};
}

View File

@ -33,6 +33,7 @@ struct FormatSettings
bool defaults_for_omitted_fields = true;
bool seekable_read = true;
UInt64 max_rows_to_read_for_schema_inference = 100;
enum class DateTimeInputFormat
{
@ -217,6 +218,11 @@ struct FormatSettings
{
EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES;
} capn_proto;
struct
{
UInt64 number_of_columns = 0;
} msgpack;
};
}

View File

@ -1,7 +1,17 @@
#include <IO/ReadHelpers.h>
#include <Formats/JSONEachRowUtils.h>
#include <Formats/ReadSchemaUtils.h>
#include <IO/ReadBufferFromString.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <Functions/SimdJSONParser.h>
#include <Functions/RapidJSONParser.h>
#include <Functions/DummyJSONParser.h>
#include <base/find_symbols.h>
@ -26,7 +36,7 @@ static std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer
while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size || number_of_rows < min_rows))
{
const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
if (current_object_size > 10 * min_chunk_size)
if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size)
throw ParsingException("Size of JSON object is extremely large. Expected not greater than " +
std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) +
" bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA);
@ -92,6 +102,122 @@ static std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer
return {loadAtPosition(in, memory, pos), number_of_rows};
}
/// Read exactly one row (one balanced {...} or [...] object, per the bracket
/// template parameters) from `in` into a String, reusing the segmentation
/// engine: min_chunk_size = 0 and min_rows = 1 make it stop after the first
/// complete row (min_chunk_size = 0 also disables the "object is extremely
/// large" check — see fileSegmentationEngineJSONEachRowImpl).
template <const char opening_bracket, const char closing_bracket>
static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in)
{
    Memory memory;
    fileSegmentationEngineJSONEachRowImpl<opening_bracket, closing_bracket>(in, memory, 0, 1);
    return String(memory.data(), memory.size());
}
/// Map a parsed JSON value to a ClickHouse type.
/// Returns nullptr when the type cannot be determined: null, empty array,
/// empty object, heterogeneous object values, or an array whose element type
/// is itself undeterminable. All JSON numbers are widened to Float64 so that
/// mixed int/float columns unify. Scalars are wrapped in Nullable; arrays,
/// tuples and maps are not.
template <class Element>
DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field)
{
    if (field.isNull())
        return nullptr;

    if (field.isBool())
        return makeNullable(std::make_shared<DataTypeUInt8>());

    if (field.isInt64() || field.isUInt64() || field.isDouble())
        return makeNullable(std::make_shared<DataTypeFloat64>());

    if (field.isString())
        return makeNullable(std::make_shared<DataTypeString>());

    if (field.isArray())
    {
        auto array = field.getArray();

        /// Return nullptr in case of empty array because we cannot determine nested type.
        if (array.size() == 0)
            return nullptr;

        DataTypes nested_data_types;
        /// If this array contains fields with different types we will treat it as Tuple.
        bool is_tuple = false;
        for (const auto element : array)
        {
            auto type = getDataTypeFromJSONFieldImpl(element);
            if (!type)
                return nullptr;

            if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName())
                is_tuple = true;

            nested_data_types.push_back(std::move(type));
        }

        if (is_tuple)
            return std::make_shared<DataTypeTuple>(nested_data_types);

        return std::make_shared<DataTypeArray>(nested_data_types.back());
    }

    if (field.isObject())
    {
        auto object = field.getObject();
        DataTypePtr value_type;
        for (const auto key_value_pair : object)
        {
            auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second);
            if (!type)
                return nullptr;

            if (value_type && value_type->getName() != type->getName())
                return nullptr;

            value_type = type;
        }

        /// Fix: an empty object leaves value_type null, which would previously
        /// construct an invalid Map(String, nullptr). Mirror the empty-array case.
        if (!value_type)
            return nullptr;

        return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_type);
    }

    throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"};
}
/// Return a default-constructed (parser, element) pair for the best JSON
/// parser available at build time: simdjson, then rapidjson, then the dummy
/// fallback. Intended to be used as `auto [parser, element] = ...` so callers
/// pick up the parser type without spelling it out.
auto getJSONParserAndElement()
{
#if USE_SIMDJSON
    return std::pair<SimdJSONParser, SimdJSONParser::Element>();
#elif USE_RAPIDJSON
    return std::pair<RapidJSONParser, RapidJSONParser::Element>();
#else
    return std::pair<DummyJSONParser, DummyJSONParser::Element>();
#endif
}
/// Parse a single JSON value from a string and convert it to a ClickHouse type.
/// Throws INCORRECT_DATA when the string is not valid JSON.
DataTypePtr getDataTypeFromJSONField(const String & field)
{
    auto [parser, document] = getJSONParserAndElement();
    if (!parser.parse(field, document))
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object");
    return getDataTypeFromJSONFieldImpl(document);
}
/// Read one row, split it into per-column JSON values with the Extractor,
/// and infer a ClickHouse type for each value.
template <class Extractor, const char opening_bracket, const char closing_bracket>
static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor)
{
    const String row = readJSONEachRowLineIntoStringImpl<opening_bracket, closing_bracket>(in);

    auto [parser, document] = getJSONParserAndElement();
    if (!parser.parse(row, document))
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object");

    auto fields = extractor.extract(document);

    DataTypes types;
    types.reserve(fields.size());
    for (const auto & field : fields)
        types.push_back(getDataTypeFromJSONFieldImpl(field));

    /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings.
    /// Should we try to parse data inside strings somehow in this case?
    return types;
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1);
@ -102,6 +228,60 @@ std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in
return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows);
}
/// Extracts all values of a JSON object row {..., "<column_name>" : <value>, ...}
/// while recording each value's key in `column_names` (kept in the same order
/// as the returned values).
struct JSONEachRowFieldsExtractor
{
    template <class Element>
    std::vector<Element> extract(const Element & element)
    {
        auto object = element.getObject();

        std::vector<Element> values;
        values.reserve(object.size());
        column_names.reserve(object.size());

        for (const auto & entry : object)
        {
            column_names.emplace_back(entry.first);
            values.push_back(entry.second);
        }
        return values;
    }

    std::vector<String> column_names;
};
/// Read one JSONEachRow row and return {column_name : inferred type};
/// a nullptr type means the field's type could not be determined.
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
{
    JSONEachRowFieldsExtractor extractor;
    auto types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, json_strings, extractor);

    std::unordered_map<String, DataTypePtr> names_to_types;
    for (size_t i = 0; i != extractor.column_names.size(); ++i)
        names_to_types[extractor.column_names[i]] = types[i];
    return names_to_types;
}
/// Extracts all positional values of a JSON array row [..., <value>, ...].
struct JSONCompactEachRowFieldsExtractor
{
    template <class Element>
    std::vector<Element> extract(const Element & element)
    {
        auto array = element.getArray();

        std::vector<Element> values;
        values.reserve(array.size());
        for (const auto value : array)
            values.push_back(value);
        return values;
    }
};
/// Read one JSONCompactEachRow row ([...]) and infer a type for every
/// positional field; entries may be nullptr when a type is undeterminable.
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings)
{
    JSONCompactEachRowFieldsExtractor extractor;
    auto types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONCompactEachRowFieldsExtractor, '[', ']'>(in, json_strings, extractor);
    return types;
}
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
{
/// For JSONEachRow we can safely skip whitespace characters

View File

@ -11,6 +11,21 @@ namespace DB
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);
std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows);
/// Parse JSON from a string and convert its type to a ClickHouse type. Make the result type always Nullable.
/// JSON array with different nested types is treated as Tuple.
/// If cannot convert (for example when field contains null), return nullptr.
DataTypePtr getDataTypeFromJSONField(const String & field);
/// Read row in JSONEachRow format and try to determine type for each field.
/// Return map {column_name : type}.
/// If cannot determine the type of some field, return nullptr for it.
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
/// Read row in JSONCompactEachRow format and try to determine type for each field.
/// If cannot determine the type of some field, return nullptr for it.
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings);
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf);
bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings);

View File

@ -14,14 +14,14 @@ namespace ErrorCodes
extern const int INVALID_TEMPLATE_FORMAT;
}
ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name)
ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes)
{
ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096);
String format_string;
readStringUntilEOF(format_string, schema_file);
try
{
parse(format_string, idx_by_name);
parse(format_string, idx_by_name, allow_indexes);
}
catch (DB::Exception & e)
{
@ -33,7 +33,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo &
}
void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name)
void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes)
{
enum ParserState
{
@ -100,6 +100,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum
column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10);
if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno)
column_idx = idx_by_name(column_names.back());
else if (!allow_indexes)
throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed");
}
format_idx_to_column_idx.emplace_back(column_idx);
break;

View File

@ -31,9 +31,9 @@ struct ParsedTemplateFormatString
typedef std::function<std::optional<size_t>(const String &)> ColumnIdxGetter;
ParsedTemplateFormatString() = default;
ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name);
ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true);
void parse(const String & format_string, const ColumnIdxGetter & idx_by_name);
void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true);
static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s);
size_t columnsCount() const;

View File

@ -24,6 +24,7 @@
# include <DataTypes/DataTypeMap.h>
# include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypeTuple.h>
# include <DataTypes/DataTypeString.h>
# include <DataTypes/Serializations/SerializationDecimal.h>
# include <DataTypes/Serializations/SerializationFixedString.h>
# include <Formats/ProtobufReader.h>
@ -56,6 +57,7 @@ namespace ErrorCodes
extern const int PROTOBUF_FIELD_NOT_REPEATED;
extern const int PROTOBUF_BAD_CAST;
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
}
namespace
@ -3017,10 +3019,8 @@ namespace
{
std::vector<std::string_view> column_names_used;
column_names_used.reserve(used_column_indices_in_nested.size());
for (size_t i : used_column_indices_in_nested)
column_names_used.emplace_back(nested_column_names[i]);
auto field_serializer = std::make_unique<ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages>(
std::move(column_names_used), field_descriptor, std::move(nested_message_serializer), get_root_desc_function);
transformColumnIndices(used_column_indices_in_nested, nested_column_indices);
@ -3230,8 +3230,105 @@ namespace
std::function<String(size_t)> get_root_desc_function;
std::shared_ptr<ProtobufSerializer *> root_serializer_ptr;
};
}
/// Build a ClickHouse Enum data type mirroring a protobuf enum:
/// every (name, number) pair from the descriptor becomes one enum element.
/// Type is the underlying integer type (Int8 or Int16).
template <typename Type>
DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor)
{
    std::vector<std::pair<String, Type>> name_value_pairs;
    const int count = enum_descriptor->value_count();
    name_value_pairs.reserve(count);
    for (int idx = 0; idx < count; ++idx)
    {
        const auto * value_descriptor = enum_descriptor->value(idx);
        name_value_pairs.emplace_back(value_descriptor->name(), value_descriptor->number());
    }
    return std::make_shared<DataTypeEnum<Type>>(std::move(name_value_pairs));
}
/// Map a protobuf field to a ClickHouse column (name + data type).
/// Repeated fields become Array, protobuf map fields become Map, nested
/// messages become Tuple (or are flattened into "<field>_<nested>" when the
/// message holds exactly one field). allow_repeat is cleared on the recursive
/// call so the repeated/map wrapper is applied only once per field.
/// Throws BAD_ARGUMENTS for enums that are empty or don't fit into Int16.
NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true)
{
/// A protobuf map is represented as a repeated entry message with
/// (key, value) fields; reuse the Tuple inferred for the entry message
/// and wrap its elements into a Map.
if (allow_repeat && field_descriptor->is_map())
{
auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false);
const auto * tuple_type = assert_cast<const DataTypeTuple *>(name_and_type.type.get());
return {name_and_type.name, std::make_shared<DataTypeMap>(tuple_type->getElements())};
}
/// A repeated field becomes Array(element type).
if (allow_repeat && field_descriptor->is_repeated())
{
auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false);
return {name_and_type.name, std::make_shared<DataTypeArray>(name_and_type.type)};
}
/// Scalar field: map the protobuf wire type to the closest ClickHouse type.
switch (field_descriptor->type())
{
case FieldTypeId::TYPE_SFIXED32: [[fallthrough]];
case FieldTypeId::TYPE_SINT32: [[fallthrough]];
case FieldTypeId::TYPE_INT32:
return {field_descriptor->name(), std::make_shared<DataTypeInt32>()};
case FieldTypeId::TYPE_SFIXED64: [[fallthrough]];
case FieldTypeId::TYPE_SINT64: [[fallthrough]];
case FieldTypeId::TYPE_INT64:
return {field_descriptor->name(), std::make_shared<DataTypeInt64>()};
case FieldTypeId::TYPE_BOOL:
return {field_descriptor->name(), std::make_shared<DataTypeUInt8>()};
case FieldTypeId::TYPE_FLOAT:
return {field_descriptor->name(), std::make_shared<DataTypeFloat32>()};
case FieldTypeId::TYPE_DOUBLE:
return {field_descriptor->name(), std::make_shared<DataTypeFloat64>()};
case FieldTypeId::TYPE_UINT32: [[fallthrough]];
case FieldTypeId::TYPE_FIXED32:
return {field_descriptor->name(), std::make_shared<DataTypeUInt32>()};
case FieldTypeId::TYPE_UINT64: [[fallthrough]];
case FieldTypeId::TYPE_FIXED64:
return {field_descriptor->name(), std::make_shared<DataTypeUInt64>()};
case FieldTypeId::TYPE_BYTES: [[fallthrough]];
case FieldTypeId::TYPE_STRING:
return {field_descriptor->name(), std::make_shared<DataTypeString>()};
case FieldTypeId::TYPE_ENUM:
{
const auto * enum_descriptor = field_descriptor->enum_type();
if (enum_descriptor->value_count() == 0)
throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS);
/// Pick the narrowest ClickHouse Enum that can hold every value:
/// max |value| < 128 -> Enum8, < 32768 -> Enum16, larger is unsupported.
int max_abs = std::abs(enum_descriptor->value(0)->number());
for (int i = 1; i != enum_descriptor->value_count(); ++i)
{
if (std::abs(enum_descriptor->value(i)->number()) > max_abs)
max_abs = std::abs(enum_descriptor->value(i)->number());
}
if (max_abs < 128)
return {field_descriptor->name(), getEnumDataType<Int8>(enum_descriptor)};
else if (max_abs < 32768)
return {field_descriptor->name(), getEnumDataType<Int16>(enum_descriptor)};
else
throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS);
}
case FieldTypeId::TYPE_GROUP: [[fallthrough]];
case FieldTypeId::TYPE_MESSAGE:
{
const auto * message_descriptor = field_descriptor->message_type();
/// Single-field message: flatten it to "<outer>_<inner>" instead of
/// creating a one-element Tuple.
if (message_descriptor->field_count() == 1)
{
const auto * nested_field_descriptor = message_descriptor->field(0);
auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor);
return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type};
}
else
{
/// Multi-field message: build a named Tuple from all its fields.
DataTypes nested_types;
Strings nested_names;
for (int i = 0; i != message_descriptor->field_count(); ++i)
{
auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i));
nested_types.push_back(nested_name_and_type.type);
nested_names.push_back(nested_name_and_type.name);
}
return {field_descriptor->name(), std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names))};
}
}
}
/// Not reachable: the switch above handles every FieldTypeId value.
__builtin_unreachable();
}
}
std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create(
const Strings & column_names,
@ -3254,5 +3351,14 @@ std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create(
std::vector<size_t> missing_column_indices;
return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter);
}
/// Translate a protobuf message descriptor into a ClickHouse table schema:
/// one (name, type) entry per top-level field of the message.
NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor)
{
    NamesAndTypesList result;
    const int field_count = message_descriptor->field_count();
    for (int idx = 0; idx < field_count; ++idx)
        result.push_back(getNameAndDataTypeFromField(message_descriptor->field(idx)));
    return result;
}
}
#endif

View File

@ -4,6 +4,7 @@
#if USE_PROTOBUF
# include <Columns/IColumn.h>
#include <Core/NamesAndTypes.h>
namespace google::protobuf { class Descriptor; }
@ -48,5 +49,7 @@ public:
ProtobufWriter & writer);
};
NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor);
}
#endif

View File

@ -0,0 +1,112 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <Formats/ReadSchemaUtils.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Common/assert_cast.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
extern const int BAD_ARGUMENTS;
}
/// Infer the table structure for data in the given format.
/// If the format has an external schema reader, it is used and no data is read;
/// otherwise a read buffer is created via read_buffer_creator and the schema is
/// inferred from the data itself.
/// Throws CANNOT_EXTRACT_TABLE_STRUCTURE when inference fails or the input is
/// empty, and BAD_ARGUMENTS when the format supports no schema inference.
ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
{
    auto & factory = FormatFactory::instance();
    NamesAndTypesList names_and_types;

    if (factory.checkIfFormatHasExternalSchemaReader(format_name))
    {
        auto external_schema_reader = factory.getExternalSchemaReader(format_name, context, format_settings);
        try
        {
            names_and_types = external_schema_reader->readSchema();
        }
        catch (const DB::Exception & e)
        {
            /// Wrap the original error so the user sees which format failed.
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message());
        }
    }
    else if (factory.checkIfFormatHasSchemaReader(format_name))
    {
        auto read_buf = read_buffer_creator();
        if (read_buf->eof())
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name);

        auto schema_reader = factory.getSchemaReader(format_name, *read_buf, context, format_settings);
        try
        {
            names_and_types = schema_reader->readSchema();
        }
        catch (const DB::Exception & e)
        {
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message());
        }
    }
    else
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name);

    return ColumnsDescription(names_and_types);
}
/// Convert a type to its most general form (see ReadSchemaUtils.h):
///  - any numeric type becomes Nullable(Float64),
///  - any other leaf type T becomes Nullable(T),
///  - Array/Tuple/Map/LowCardinality are generalized element-wise
///    (a Map key is kept non-Nullable, as Map keys cannot be Nullable).
/// Returns nullptr if the type is (or contains) Nothing, i.e. could not be inferred.
DataTypePtr generalizeDataType(DataTypePtr type)
{
    WhichDataType which(type);

    /// Nothing means the type could not be determined at all.
    if (which.isNothing())
        return nullptr;

    /// Strip Nullable and generalize the nested type (it is re-wrapped below).
    if (which.isNullable())
    {
        const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
        return generalizeDataType(nullable_type->getNestedType());
    }

    /// All numeric types are widened to Float64 so that rows with different
    /// numeric types in the same column remain compatible.
    if (isNumber(type))
        return makeNullable(std::make_shared<DataTypeFloat64>());

    if (which.isArray())
    {
        const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
        auto nested_type = generalizeDataType(array_type->getNestedType());
        return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
    }

    if (which.isTuple())
    {
        const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
        DataTypes nested_types;
        for (const auto & element : tuple_type->getElements())
        {
            auto nested_type = generalizeDataType(element);
            if (!nested_type)
                return nullptr;
            nested_types.push_back(nested_type);
        }
        return std::make_shared<DataTypeTuple>(std::move(nested_types));
    }

    if (which.isMap())
    {
        const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
        /// Check the generalized key for nullptr BEFORE calling removeNullable:
        /// previously removeNullable(generalizeDataType(key)) was evaluated
        /// unconditionally, dereferencing a null pointer when the key type
        /// was Nothing.
        auto key_type = generalizeDataType(map_type->getKeyType());
        auto value_type = generalizeDataType(map_type->getValueType());
        if (!key_type || !value_type)
            return nullptr;
        /// Map keys cannot be Nullable.
        return std::make_shared<DataTypeMap>(removeNullable(key_type), value_type);
    }

    if (which.isLowCarnality())
    {
        const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
        auto nested_type = generalizeDataType(lc_type->getDictionaryType());
        return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
    }

    /// Any other leaf type: just wrap it into Nullable.
    return makeNullable(type);
}
}

View File

@ -0,0 +1,30 @@
#pragma once
#include <Storages/ColumnsDescription.h>
#include <Formats/FormatFactory.h>
namespace DB
{
/// Try to determine the schema of the data in the specified format.
/// For formats that have an external schema reader, it will
/// use it and won't create a read buffer.
/// For formats that have a schema reader from the data,
/// read buffer will be created by the provided creator and
/// the schema will be extracted from the data.
/// If format doesn't have any schema reader or a schema reader
/// couldn't determine the schema, an exception will be thrown.
using ReadBufferCreator = std::function<std::unique_ptr<ReadBuffer>()>;
ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context);
/// Convert type to the most general type:
/// - IntN, UIntN, FloatN, Decimal -> Float64
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
/// If type is Nothing or one of the nested types is Nothing, return nullptr.
DataTypePtr generalizeDataType(DataTypePtr type);
}

View File

@ -10,4 +10,3 @@
#cmakedefine01 USE_ARROW
#cmakedefine01 USE_PROTOBUF
#cmakedefine01 USE_MSGPACK

View File

@ -81,6 +81,28 @@ void registerInputFormatCapnProto(FormatFactory & factory);
void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory);
void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory);
void registerArrowSchemaReader(FormatFactory & factory);
void registerParquetSchemaReader(FormatFactory & factory);
void registerORCSchemaReader(FormatFactory & factory);
void registerTSVSchemaReader(FormatFactory & factory);
void registerCSVSchemaReader(FormatFactory & factory);
void registerJSONCompactEachRowSchemaReader(FormatFactory & factory);
void registerJSONEachRowSchemaReader(FormatFactory & factory);
void registerNativeSchemaReader(FormatFactory & factory);
void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory);
void registerAvroSchemaReader(FormatFactory & factory);
void registerProtobufSchemaReader(FormatFactory & factory);
void registerLineAsStringSchemaReader(FormatFactory & factory);
void registerJSONAsStringSchemaReader(FormatFactory & factory);
void registerRawBLOBSchemaReader(FormatFactory & factory);
void registerMsgPackSchemaReader(FormatFactory & factory);
void registerCapnProtoSchemaReader(FormatFactory & factory);
void registerCustomSeparatedSchemaReader(FormatFactory & factory);
void registerRegexpSchemaReader(FormatFactory & factory);
void registerTSKVSchemaReader(FormatFactory & factory);
void registerValuesSchemaReader(FormatFactory & factory);
void registerTemplateSchemaReader(FormatFactory & factory);
void registerFormats()
{
auto & factory = FormatFactory::instance();
@ -152,6 +174,28 @@ void registerFormats()
registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory);
registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory);
registerArrowSchemaReader(factory);
registerParquetSchemaReader(factory);
registerORCSchemaReader(factory);
registerTSVSchemaReader(factory);
registerCSVSchemaReader(factory);
registerJSONCompactEachRowSchemaReader(factory);
registerJSONEachRowSchemaReader(factory);
registerNativeSchemaReader(factory);
registerRowBinaryWithNamesAndTypesSchemaReader(factory);
registerAvroSchemaReader(factory);
registerProtobufSchemaReader(factory);
registerLineAsStringSchemaReader(factory);
registerJSONAsStringSchemaReader(factory);
registerRawBLOBSchemaReader(factory);
registerMsgPackSchemaReader(factory);
registerCapnProtoSchemaReader(factory);
registerCustomSeparatedSchemaReader(factory);
registerRegexpSchemaReader(factory);
registerTSKVSchemaReader(factory);
registerValuesSchemaReader(factory);
registerTemplateSchemaReader(factory);
}
}

View File

@ -1835,6 +1835,8 @@ public:
size_t getNumberOfArguments() const override { return 0; }
bool useDefaultImplementationForConstants() const override { return true; }
bool canBeExecutedOnDefaultArguments() const override { return false; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override

View File

@ -39,7 +39,7 @@ BrotliReadBuffer::BrotliReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_s
, in_data(nullptr)
, out_capacity(0)
, out_data(nullptr)
, eof(false)
, eof_flag(false)
{
}
@ -47,7 +47,7 @@ BrotliReadBuffer::~BrotliReadBuffer() = default;
bool BrotliReadBuffer::nextImpl()
{
if (eof)
if (eof_flag)
return false;
if (!in_available)
@ -74,7 +74,7 @@ bool BrotliReadBuffer::nextImpl()
{
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}
else

View File

@ -32,7 +32,7 @@ private:
size_t out_capacity;
uint8_t * out_data;
bool eof;
bool eof_flag;
};
}

View File

@ -42,7 +42,7 @@ Bzip2ReadBuffer::Bzip2ReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_siz
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment)
, in(std::move(in_))
, bz(std::make_unique<Bzip2StateWrapper>())
, eof(false)
, eof_flag(false)
{
}
@ -50,7 +50,7 @@ Bzip2ReadBuffer::~Bzip2ReadBuffer() = default;
bool Bzip2ReadBuffer::nextImpl()
{
if (eof)
if (eof_flag)
return false;
if (!bz->stream.avail_in)
@ -72,7 +72,7 @@ bool Bzip2ReadBuffer::nextImpl()
{
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}
else
@ -91,7 +91,7 @@ bool Bzip2ReadBuffer::nextImpl()
if (in->eof())
{
eof = true;
eof_flag = true;
throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of bzip2 archive");
}

View File

@ -26,7 +26,7 @@ private:
class Bzip2StateWrapper;
std::unique_ptr<Bzip2StateWrapper> bz;
bool eof;
bool eof_flag;
};
}

View File

@ -7,7 +7,7 @@ namespace ErrorCodes
extern const int LZMA_STREAM_DECODER_FAILED;
}
LZMAInflatingReadBuffer::LZMAInflatingReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment)
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment), in(std::move(in_)), eof(false)
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment), in(std::move(in_)), eof_flag(false)
{
lstr = LZMA_STREAM_INIT;
lstr.allocator = nullptr;
@ -36,7 +36,7 @@ LZMAInflatingReadBuffer::~LZMAInflatingReadBuffer()
bool LZMAInflatingReadBuffer::nextImpl()
{
if (eof)
if (eof_flag)
return false;
lzma_action action = LZMA_RUN;
@ -64,7 +64,7 @@ bool LZMAInflatingReadBuffer::nextImpl()
{
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}
else

View File

@ -25,7 +25,7 @@ private:
std::unique_ptr<ReadBuffer> in;
lzma_stream lstr;
bool eof;
bool eof_flag;
};
}

View File

@ -32,7 +32,7 @@ Lz4InflatingReadBuffer::~Lz4InflatingReadBuffer()
bool Lz4InflatingReadBuffer::nextImpl()
{
if (eof)
if (eof_flag)
return false;
if (!in_available)
@ -66,7 +66,7 @@ bool Lz4InflatingReadBuffer::nextImpl()
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}

View File

@ -35,7 +35,7 @@ private:
size_t in_available;
size_t out_available;
bool eof = false;
bool eof_flag = false;
};
}

View File

@ -702,6 +702,25 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set
readCSVStringInto(s, buf, settings);
}
/// Read a CSV field like readCSVString, but keep the surrounding quote
/// characters (single or double) in the result instead of stripping them.
void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
{
    s.clear();

    /// Remember the opening quote (if any) so it can be restored around
    /// the unquoted value produced by readCSVStringInto.
    char quote_char = 0;
    if (!buf.eof())
    {
        const char first = *buf.position();
        if (first == '\'' || first == '"')
            quote_char = first;
    }

    if (quote_char)
        s.push_back(quote_char);
    readCSVStringInto(s, buf, settings);
    if (quote_char)
        s.push_back(quote_char);
}
template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
@ -1212,6 +1231,19 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
}
}
/// Run parse_func over the buffer and append exactly the characters it
/// consumed to `s`. A PeekableReadBuffer checkpoint marks the start of the
/// field; after parsing, the consumed span is replayed into the string and
/// the buffer position is restored to the end of the parsed value.
template <typename ParseFunc>
static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func)
{
    PeekableReadBuffer wrapper(buf);
    wrapper.setCheckpoint();
    parse_func(wrapper);
    /// Ensure the checkpoint..position span lies in one contiguous chunk.
    wrapper.makeContinuousMemoryFromCheckpointToPos();
    auto * parsed_end = wrapper.position();
    wrapper.rollbackToCheckpoint();
    s.append(wrapper.position(), parsed_end);
    wrapper.position() = parsed_end;
}
template <char opening_bracket, char closing_bracket>
static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
@ -1266,7 +1298,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
/// - Number: integer, float, decimal.
if (*buf.position() == '\'')
readQuotedString(s, buf);
{
s.push_back('\'');
readQuotedStringInto<false>(s, buf);
s.push_back('\'');
}
else if (*buf.position() == '[')
readQuotedFieldInBrackets<'[', ']'>(s, buf);
else if (*buf.position() == '(')
@ -1290,18 +1326,19 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
else
{
/// It's an integer, float or decimal. They all can be parsed as float.
/// Use PeekableReadBuffer to copy field to string after parsing.
PeekableReadBuffer peekable_buf(buf);
peekable_buf.setCheckpoint();
Float64 tmp;
readFloatText(tmp, peekable_buf);
peekable_buf.makeContinuousMemoryFromCheckpointToPos();
auto * end = peekable_buf.position();
peekable_buf.rollbackToCheckpoint();
s.append(peekable_buf.position(), end);
peekable_buf.position() = end;
auto parse_func = [](ReadBuffer & in)
{
Float64 tmp;
readFloatText(tmp, in);
};
readParsedValueIntoString(s, buf, parse_func);
}
}
/// Copy one raw JSON field (object, array, string, number, ...) from the
/// buffer into `s` without interpreting it: skipJSONField finds its extent,
/// readParsedValueIntoString captures the consumed characters.
void readJSONFieldIntoString(String & s, ReadBuffer & buf)
{
    readParsedValueIntoString(s, buf, [](ReadBuffer & in) { skipJSONField(in, "json_field"); });
}
}

View File

@ -563,6 +563,8 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf);
*/
void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
/// Differs from readCSVString in that it doesn't remove the quotes around the field, if any.
void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
/// Read and append result to array of characters.
template <typename Vector>
@ -1381,4 +1383,7 @@ struct PcgDeserializer
void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
void readJSONFieldIntoString(String & s, ReadBuffer & buf);
}

View File

@ -16,7 +16,7 @@ ZlibInflatingReadBuffer::ZlibInflatingReadBuffer(
size_t alignment)
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment)
, in(std::move(in_))
, eof(false)
, eof_flag(false)
{
zstr.zalloc = nullptr;
zstr.zfree = nullptr;
@ -54,7 +54,7 @@ bool ZlibInflatingReadBuffer::nextImpl()
do
{
/// if we already found eof, we shouldn't do anything
if (eof)
if (eof_flag)
return false;
/// if there is no available bytes in zstr, move ptr to next available data
@ -83,7 +83,7 @@ bool ZlibInflatingReadBuffer::nextImpl()
/// * false if there is no data in working buffer
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}
/// If it is not end of file, we need to reset zstr and return true, because we still have some data to read

View File

@ -33,7 +33,7 @@ private:
std::unique_ptr<ReadBuffer> in;
z_stream zstr;
bool eof;
bool eof_flag;
};
}

View File

@ -31,7 +31,7 @@ bool ZstdInflatingReadBuffer::nextImpl()
do
{
// If it is known that end of file was reached, return false
if (eof)
if (eof_flag)
return false;
/// If end was reached, get next part
@ -64,7 +64,7 @@ bool ZstdInflatingReadBuffer::nextImpl()
/// If end of file is reached, fill eof variable and return true if there is some data in buffer, otherwise return false
if (in->eof())
{
eof = true;
eof_flag = true;
return !working_buffer.empty();
}
/// It is possible, that input buffer is not at eof yet, but nothing was decompressed in current iteration.

View File

@ -31,7 +31,7 @@ private:
ZSTD_DCtx * dctx;
ZSTD_inBuffer input;
ZSTD_outBuffer output;
bool eof = false;
bool eof_flag = false;
};
}

View File

@ -637,13 +637,14 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
/// Table function without columns list.
auto table_function = TableFunctionFactory::instance().get(create.as_table_function, getContext());
properties.columns = table_function->getActualTableStructure(getContext());
assert(!properties.columns.empty());
}
else if (create.is_dictionary)
{
return {};
}
else
/// We can have queries like "CREATE TABLE <table> ENGINE=<engine>" if <engine>
/// supports schema inference (will determine table structure in its constructor).
else if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(create.storage->engine->name))
throw Exception("Incorrect CREATE query: required list of column descriptions or AS section or SELECT.", ErrorCodes::INCORRECT_QUERY);
/// Even if query has list of columns, canonicalize it (unfold Nested columns).
@ -1083,7 +1084,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
{
const auto & factory = TableFunctionFactory::instance();
auto table_func = factory.get(create.as_table_function, getContext());
res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns);
/// In case of CREATE AS table_function() query we should use global context
/// in storage creation because there will be no query context on server startup
/// and because storage lifetime is bigger than query context lifetime.
res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns, /*use_global_context=*/true);
res->renameInMemory({create.getDatabase(), create.getTable(), create.uuid});
}
else

View File

@ -359,7 +359,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat
if (as_table_function)
{
if (columns_list)
if (columns_list && !columns_list->empty())
{
frame.expression_list_always_start_on_new_line = true;
settings.ostr << (settings.one_line ? " (" : "\n(");
@ -375,7 +375,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat
frame.expression_list_always_start_on_new_line = true;
if (columns_list && !as_table_function)
if (columns_list && !columns_list->empty() && !as_table_function)
{
settings.ostr << (settings.one_line ? " (" : "\n(");
FormatStateStacked frame_nested = frame;

View File

@ -50,6 +50,12 @@ public:
ASTPtr clone() const override;
void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override;
bool empty()
{
return (!columns || columns->children.empty()) && (!indices || indices->children.empty()) && (!constraints || constraints->children.empty())
&& (!projections || projections->children.empty());
}
};

View File

@ -557,34 +557,43 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
}
}
}
/** Create queries without list of columns:
* - CREATE|ATTACH TABLE ... AS ...
* - CREATE|ATTACH TABLE ... ENGINE = engine
*/
else
{
storage_p.parse(pos, storage, expected);
if (!s_as.ignore(pos, expected))
return false;
if (!select_p.parse(pos, select, expected)) /// AS SELECT ...
/// CREATE|ATTACH TABLE ... AS ...
if (s_as.ignore(pos, expected))
{
/// ENGINE can not be specified for table functions.
if (storage || !table_function_p.parse(pos, as_table_function, expected))
if (!select_p.parse(pos, select, expected)) /// AS SELECT ...
{
/// AS [db.]table
if (!name_p.parse(pos, as_table, expected))
return false;
if (s_dot.ignore(pos, expected))
/// ENGINE can not be specified for table functions.
if (storage || !table_function_p.parse(pos, as_table_function, expected))
{
as_database = as_table;
/// AS [db.]table
if (!name_p.parse(pos, as_table, expected))
return false;
}
/// Optional - ENGINE can be specified.
if (!storage)
storage_p.parse(pos, storage, expected);
if (s_dot.ignore(pos, expected))
{
as_database = as_table;
if (!name_p.parse(pos, as_table, expected))
return false;
}
/// Optional - ENGINE can be specified.
if (!storage)
storage_p.parse(pos, storage, expected);
}
}
}
else if (!storage)
{
return false;
}
}
auto comment = parseComment(pos, expected);

View File

@ -361,6 +361,8 @@ protected:
* Or:
* CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] AS ENGINE = engine SELECT ...
*
* Or (for engines that supports schema inference):
* CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] ENGINE = engine
*/
class ParserCreateTableQuery : public IParserBase
{

View File

@ -0,0 +1,160 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/ReadSchemaUtils.h>
#include <DataTypes/DataTypeString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
}
/// in_: buffer with the data to infer the schema from (not owned).
/// max_rows_to_read_: upper bound on the number of rows examined.
/// default_type_: fallback type used when rows disagree on a column's type or
/// a column contains only Nulls; when nullptr, those cases throw instead.
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_)
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_)
{
}
/// Infer the schema for formats with positional (unnamed) columns.
/// Reads up to max_rows_to_read rows, determines the types of each row's
/// values and merges them column by column; conflicting types fall back to
/// default_type or cause an exception. Column names come from column_names
/// if set, otherwise default to c1, c2, ...
NamesAndTypesList IRowSchemaReader::readSchema()
{
DataTypes data_types = readRowAndGetDataTypes();
for (size_t row = 1; row < max_rows_to_read; ++row)
{
DataTypes new_data_types = readRowAndGetDataTypes();
if (new_data_types.empty())
/// We reached eof.
break;
if (new_data_types.size() != data_types.size())
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Rows have different amount of values");
for (size_t i = 0; i != data_types.size(); ++i)
{
/// We couldn't determine the type of this column in a new row, just skip it.
if (!new_data_types[i])
continue;
/// If we couldn't determine the type of column yet, just set the new type.
if (!data_types[i])
data_types[i] = new_data_types[i];
/// If the new type and the previous type for this column are different,
/// we will use default type if we have it or throw an exception.
else if (data_types[i]->getName() != new_data_types[i]->getName())
{
if (default_type)
data_types[i] = default_type;
else
throw Exception(
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
"Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName());
}
}
}
/// Check that we read at least one column.
if (data_types.empty())
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
/// If column names weren't set, use default names 'c1', 'c2', ...
if (column_names.empty())
{
column_names.reserve(data_types.size());
for (size_t i = 0; i != data_types.size(); ++i)
column_names.push_back("c" + std::to_string(i + 1));
}
/// If column names were set, check that the number of names matches the number of types.
else if (column_names.size() != data_types.size())
throw Exception(
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
"The number of column names {} differs with the number of types {}", column_names.size(), data_types.size());
NamesAndTypesList result;
for (size_t i = 0; i != data_types.size(); ++i)
{
/// Check that we could determine the type of this column.
if (!data_types[i])
{
if (!default_type)
throw Exception(
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
"Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum "
"number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference",
max_rows_to_read);
data_types[i] = default_type;
}
result.emplace_back(column_names[i], data_types[i]);
}
return result;
}
/// in_: buffer with the data to infer the schema from (not owned).
/// max_rows_to_read_: upper bound on the number of rows examined.
/// default_type_: fallback type used when rows disagree on a column's type or
/// a column contains only Nulls; when nullptr, those cases throw instead.
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_)
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_)
{
}
/// Infer the schema for formats where each row carries its own column names
/// (e.g. JSONEachRow, TSKV). Reads up to max_rows_to_read rows and merges the
/// (name -> type) maps; conflicting types fall back to default_type or cause
/// an exception.
NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
{
    auto names_and_types = readRowAndGetNamesAndDataTypes();
    for (size_t row = 1; row < max_rows_to_read; ++row)
    {
        auto new_names_and_types = readRowAndGetNamesAndDataTypes();
        if (new_names_and_types.empty())
            /// We reached eof.
            break;

        for (const auto & [name, new_type] : new_names_and_types)
        {
            auto it = names_and_types.find(name);
            /// If we didn't see this column before, just add it.
            if (it == names_and_types.end())
            {
                names_and_types[name] = new_type;
                continue;
            }

            auto & type = it->second;
            /// If we couldn't determine the type of column yet, just set the new type.
            if (!type)
                type = new_type;
            /// If the new type and the previous type for this column are different,
            /// we will use default type if we have it or throw an exception.
            else if (new_type && type->getName() != new_type->getName())
            {
                if (default_type)
                    type = default_type;
                else
                    /// Argument order fixed: the type inferred for the CURRENT row
                    /// is the "automatically defined" one and the accumulated type
                    /// is the one "defined by previous rows" (previously they were
                    /// swapped, unlike in IRowSchemaReader::readSchema).
                    throw Exception(
                        ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
                        "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_type->getName(), name, row, type->getName());
            }
        }
    }

    /// Check that we read at least one column.
    if (names_and_types.empty())
        throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");

    NamesAndTypesList result;
    for (auto & [name, type] : names_and_types)
    {
        /// Check that we could determine the type of this column.
        if (!type)
        {
            if (!default_type)
                throw Exception(
                    ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
                    "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum "
                    "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference",
                    max_rows_to_read);

            type = default_type;
        }
        result.emplace_back(name, type);
    }

    return result;
}
}

View File

@ -0,0 +1,87 @@
#pragma once
#include <Core/NamesAndTypes.h>
#include <DataTypes/IDataType.h>
#include <Formats/FormatSettings.h>
#include <IO/ReadBuffer.h>
namespace DB
{
/// Base class for schema inference for the data in some specific format.
/// It reads some data from read buffer and try to determine the schema
/// from read data.
class ISchemaReader
{
public:
    /// explicit: a ReadBuffer must not implicitly convert into a schema reader.
    explicit ISchemaReader(ReadBuffer & in_) : in(in_) {}

    /// Reads (part of) the input and returns the inferred columns with types.
    virtual NamesAndTypesList readSchema() = 0;

    virtual ~ISchemaReader() = default;

protected:
    /// Buffer the derived reader consumes; not owned, must outlive the reader.
    ReadBuffer & in;
};
/// Base class for schema inference for formats that read data row by row.
/// It reads data row by row (up to max_rows_to_read), determines types of columns
/// for each row and compare them with types from the previous rows. If some column
/// contains values with different types in different rows, the default type will be
/// used for this column or the exception will be thrown (if default type is not set).
class IRowSchemaReader : public ISchemaReader
{
public:
    /// default_type_ is the fallback for columns whose type stays unknown;
    /// nullptr means readSchema() throws for such columns instead.
    IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr);

    NamesAndTypesList readSchema() override;

protected:
    /// Read one row and determine types of columns in it.
    /// Return types in the same order in which the values were in the row.
    /// If it's impossible to determine the type for some column, return nullptr for it.
    /// Return empty list if can't read more data.
    virtual DataTypes readRowAndGetDataTypes() = 0;

    /// Lets derived readers supply column names (e.g. taken from a header row).
    void setColumnNames(const std::vector<String> & names) { column_names = names; }

private:
    size_t max_rows_to_read;   /// Row limit for type inference.
    DataTypePtr default_type;  /// May be nullptr.
    std::vector<String> column_names;
};
/// Base class for schema inference for formats that read data row by row and each
/// row contains column names and values (ex: JSONEachRow, TSKV).
/// Differ from IRowSchemaReader in that after reading a row we get
/// a map {column_name : type} and some columns may be missed in a single row
/// (in this case we will use types from the previous rows for missed columns).
class IRowWithNamesSchemaReader : public ISchemaReader
{
public:
    /// default_type_ is the fallback for columns whose type stays unknown;
    /// nullptr means readSchema() throws for such columns instead.
    IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr);

    NamesAndTypesList readSchema() override;

protected:
    /// Read one row and determine types of columns in it.
    /// Return map {column_name : type}.
    /// If it's impossible to determine the type for some column, return nullptr for it.
    /// Return empty map if can't read more data.
    virtual std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() = 0;

private:
    size_t max_rows_to_read;   /// Row limit for type inference.
    DataTypePtr default_type;  /// May be nullptr.
};
/// Base class for schema inference for formats that don't need any data to
/// determine the schema: formats with constant schema (ex: JSONAsString, LineAsString)
/// and formats that use external format schema (ex: Protobuf, CapnProto).
class IExternalSchemaReader
{
public:
    /// Returns the schema without consuming any input data.
    virtual NamesAndTypesList readSchema() = 0;

    virtual ~IExternalSchemaReader() = default;
};
}

View File

@ -85,31 +85,38 @@ void ArrowBlockInputFormat::resetParser()
record_batch_current = 0;
}
/// Opens an Arrow IPC stream reader over the given read buffer.
/// Throws UNKNOWN_EXCEPTION if Arrow reports an error on open.
static std::shared_ptr<arrow::RecordBatchReader> createStreamReader(ReadBuffer & in)
{
    auto input_stream = std::make_unique<ArrowInputStreamFromReadBuffer>(in);
    auto open_result = arrow::ipc::RecordBatchStreamReader::Open(std::move(input_stream));
    if (!open_result.ok())
        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", open_result.status().ToString());
    return *open_result;
}
/// Opens an Arrow IPC file reader over the given read buffer.
/// Returns nullptr if reading was cancelled (is_stopped set) while the data
/// was being materialized into an arrow file; throws on Arrow open errors.
static std::shared_ptr<arrow::ipc::RecordBatchFileReader> createFileReader(ReadBuffer & in, const FormatSettings & format_settings, std::atomic<int> & is_stopped)
{
    auto arrow_file = asArrowFile(in, format_settings, is_stopped);
    if (is_stopped)
        return nullptr;

    auto open_result = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file));
    if (!open_result.ok())
        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", open_result.status().ToString());
    return *open_result;
}
void ArrowBlockInputFormat::prepareReader()
{
std::shared_ptr<arrow::Schema> schema;
if (stream)
{
auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique<ArrowInputStreamFromReadBuffer>(*in));
if (!stream_reader_status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
"Error while opening a table: {}", stream_reader_status.status().ToString());
stream_reader = *stream_reader_status;
schema = stream_reader->schema();
}
stream_reader = createStreamReader(*in);
else
{
auto arrow_file = asArrowFile(*in, format_settings, is_stopped);
if (is_stopped)
file_reader = createFileReader(*in, format_settings, is_stopped);
if (!file_reader)
return;
auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file));
if (!file_reader_status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
"Error while opening a table: {}", file_reader_status.status().ToString());
file_reader = *file_reader_status;
schema = file_reader->schema();
}
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested);
@ -122,6 +129,27 @@ void ArrowBlockInputFormat::prepareReader()
record_batch_current = 0;
}
/// stream_ == true selects the ArrowStream (IPC stream) format,
/// false selects the Arrow (IPC file) format.
ArrowSchemaReader::ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_)
    : ISchemaReader(in_), stream(stream_), format_settings(format_settings_)
{
}
/// Infers the table structure from the Arrow schema embedded in the input.
NamesAndTypesList ArrowSchemaReader::readSchema()
{
    std::shared_ptr<arrow::Schema> arrow_schema;
    if (!stream)
    {
        /// Cancellation flag required by the file-reader factory; never set here.
        std::atomic<int> is_stopped = 0;
        arrow_schema = createFileReader(in, format_settings, is_stopped)->schema();
    }
    else
        arrow_schema = createStreamReader(in)->schema();

    const char * format_name = stream ? "ArrowStream" : "Arrow";
    return ArrowColumnToCHColumn::arrowSchemaToCHHeader(*arrow_schema, format_name).getNamesAndTypesList();
}
void registerInputFormatArrow(FormatFactory & factory)
{
factory.registerInputFormat(
@ -145,6 +173,20 @@ void registerInputFormatArrow(FormatFactory & factory)
});
}
/// Registers schema readers for the Arrow file and Arrow stream formats.
void registerArrowSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader(
        "Arrow",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared<ArrowSchemaReader>(buf, /* stream_= */ false, settings); });

    factory.registerSchemaReader(
        "ArrowStream",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared<ArrowSchemaReader>(buf, /* stream_= */ true, settings); });
}
}
#else
@ -154,6 +196,8 @@ class FormatFactory;
/// Stubs used when ClickHouse is built without Arrow support
/// (this branch is compiled when USE_ARROW is off).
void registerInputFormatArrow(FormatFactory &)
{
}

void registerArrowSchemaReader(FormatFactory &) {}
}
#endif

View File

@ -4,6 +4,7 @@
#if USE_ARROW
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
namespace arrow { class RecordBatchReader; }
@ -51,6 +52,18 @@ private:
std::atomic<int> is_stopped{0};
};
/// Schema reader for the Arrow / ArrowStream formats: the table structure is
/// extracted from the Arrow schema embedded in the data.
class ArrowSchemaReader : public ISchemaReader
{
public:
    ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_);

    NamesAndTypesList readSchema() override;

private:
    bool stream;                           /// true => ArrowStream, false => Arrow (file) format.
    const FormatSettings format_settings;
};
}
#endif

View File

@ -239,10 +239,8 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr<arrow::
}
template <typename DecimalType, typename DecimalArray>
static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name, DataTypePtr internal_type)
{
const auto * arrow_decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
auto internal_type = std::make_shared<DataTypeDecimal<DecimalType>>(arrow_decimal_type->precision(), arrow_decimal_type->scale());
auto internal_column = internal_type->createColumn();
auto & column = assert_cast<ColumnDecimal<DecimalType> &>(*internal_column);
auto & column_data = column.getData();
@ -259,6 +257,21 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr<arrow::Ch
return {std::move(internal_column), std::move(internal_type), column_name};
}
/// Dispatches decimal reading to the narrowest ClickHouse Decimal type
/// whose max precision can hold the Arrow column's precision.
template <typename DecimalArray>
static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr<arrow::ChunkedArray> & arrow_column, const String & column_name)
{
    const auto * decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
    const size_t precision = decimal_type->precision();
    auto ch_type = createDecimal<DataTypeDecimal>(precision, decimal_type->scale());

    if (precision <= DecimalUtils::max_precision<Decimal32>)
        return readColumnWithDecimalDataImpl<Decimal32, DecimalArray>(arrow_column, column_name, ch_type);
    if (precision <= DecimalUtils::max_precision<Decimal64>)
        return readColumnWithDecimalDataImpl<Decimal64, DecimalArray>(arrow_column, column_name, ch_type);
    if (precision <= DecimalUtils::max_precision<Decimal128>)
        return readColumnWithDecimalDataImpl<Decimal128, DecimalArray>(arrow_column, column_name, ch_type);
    return readColumnWithDecimalDataImpl<Decimal256, DecimalArray>(arrow_column, column_name, ch_type);
}
/// Creates a null bytemap from arrow's null bitmap
static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
{
@ -328,12 +341,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
const std::string & column_name,
const std::string & format_name,
bool is_nullable,
std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values)
std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values,
bool read_ints_as_dates)
{
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT)
{
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values);
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates);
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
auto nullable_type = std::make_shared<DataTypeNullable>(std::move(nested_column.type));
auto nullable_column = ColumnNullable::create(std::move(nested_column.column), std::move(nullmap_column));
@ -358,25 +372,27 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::UINT16:
{
auto column = readColumnWithNumericData<UInt16>(arrow_column, column_name);
column.type = std::make_shared<DataTypeDate>();
if (read_ints_as_dates)
column.type = std::make_shared<DataTypeDate>();
return column;
}
case arrow::Type::UINT32:
{
auto column = readColumnWithNumericData<UInt32>(arrow_column, column_name);
column.type = std::make_shared<DataTypeDateTime>();
if (read_ints_as_dates)
column.type = std::make_shared<DataTypeDateTime>();
return column;
}
case arrow::Type::TIMESTAMP:
return readColumnWithTimestampData(arrow_column, column_name);
case arrow::Type::DECIMAL128:
return readColumnWithDecimalData<Decimal128, arrow::Decimal128Array>(arrow_column, column_name);
return readColumnWithDecimalData<arrow::Decimal128Array>(arrow_column, column_name);
case arrow::Type::DECIMAL256:
return readColumnWithDecimalData<Decimal256, arrow::Decimal256Array>(arrow_column, column_name);
return readColumnWithDecimalData<arrow::Decimal256Array>(arrow_column, column_name);
case arrow::Type::MAP:
{
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
@ -388,7 +404,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::LIST:
{
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
auto array_column = ColumnArray::create(std::move(nested_column.column), std::move(offsets_column));
auto array_type = std::make_shared<DataTypeArray>(nested_column.type);
@ -413,7 +429,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
for (int i = 0; i != arrow_struct_type->num_fields(); ++i)
{
auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values);
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates);
tuple_elements.emplace_back(std::move(element.column));
tuple_types.emplace_back(std::move(element.type));
tuple_names.emplace_back(std::move(element.name));
@ -436,7 +452,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
dict_array.emplace_back(dict_chunk.dictionary());
}
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
/// We should convert read column to ColumnUnique.
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn();
@ -483,7 +499,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name
throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()};
}
static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name)
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name)
{
ColumnsWithTypeAndName sample_columns;
for (const auto & field : schema.fields())
@ -493,24 +509,21 @@ static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::stri
std::unique_ptr<arrow::ArrayBuilder> array_builder;
arrow::Status status = MakeBuilder(pool, field->type(), &array_builder);
checkStatus(status, field->name(), format_name);
std::shared_ptr<arrow::Array> arrow_array;
status = array_builder->Finish(&arrow_array);
checkStatus(status, field->name(), format_name);
arrow::ArrayVector array_vector = {arrow_array};
auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector);
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dict_values;
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values);
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false);
sample_columns.emplace_back(std::move(sample_column));
}
return Block(std::move(sample_columns));
}
ArrowColumnToCHColumn::ArrowColumnToCHColumn(
const arrow::Schema & schema, const std::string & format_name_, bool import_nested_)
: header(arrowSchemaToCHHeader(schema, format_name_)), format_name(format_name_), import_nested(import_nested_)
{
}
ArrowColumnToCHColumn::ArrowColumnToCHColumn(
const Block & header_, const std::string & format_name_, bool import_nested_)
: header(header_), format_name(format_name_), import_nested(import_nested_)
@ -553,7 +566,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
if (!nested_tables.contains(nested_table_name))
{
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[nested_table_name];
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values)};
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
Block block(cols);
nested_tables[nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
}
@ -573,7 +586,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
if (read_from_nested)
column = nested_tables[nested_table_name]->getByName(header_column.name);
else
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values);
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true);
try
{

View File

@ -23,16 +23,14 @@ public:
ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_);
/// Constructor that create header by arrow schema. It will be useful for inserting
/// data from file without knowing table structure.
ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr);
static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name);
private:
const Block header;
const Block & header;
const std::string format_name;
bool import_nested;

View File

@ -815,6 +815,92 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc
return it->second;
}
/// confluent_ == true selects the AvroConfluent format (schema-registry
/// framed messages); false selects plain Avro data files.
AvroSchemaReader::AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_)
    : ISchemaReader(in_), confluent(confluent_), format_settings(format_settings_)
{
}
/// Obtains the Avro root schema (from the schema registry for Confluent
/// framing, from the file header otherwise) and maps its leaves to columns.
NamesAndTypesList AvroSchemaReader::readSchema()
{
    avro::NodePtr root_node;
    if (confluent)
    {
        /// Confluent framing: the payload starts with a schema id that is
        /// resolved through the configured schema registry.
        UInt32 schema_id = readConfluentSchemaId(in);
        root_node = getConfluentSchemaRegistry(format_settings)->getSchema(schema_id).root();
    }
    else
    {
        /// Plain Avro data file: the schema is embedded in the file header.
        auto file_reader_ptr = std::make_unique<avro::DataFileReaderBase>(std::make_unique<InputStreamReadBufferAdapter>(in));
        root_node = file_reader_ptr->dataSchema().root();
    }

    /// Only record schemas map onto a table: one leaf per column.
    if (root_node->type() != avro::Type::AVRO_RECORD)
        throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH);

    NamesAndTypesList names_and_types;
    for (size_t i = 0; i != root_node->leaves(); ++i)
        names_and_types.emplace_back(root_node->nameAt(i), avroNodeToDataType(root_node->leafAt(i)));

    return names_and_types;
}
/// Maps one Avro schema node to the closest ClickHouse data type.
/// Throws ILLEGAL_COLUMN for Avro types with no ClickHouse counterpart.
DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node)
{
    switch (node->type())
    {
        case avro::Type::AVRO_INT:
            return {std::make_shared<DataTypeInt32>()};
        case avro::Type::AVRO_LONG:
            return std::make_shared<DataTypeInt64>();
        case avro::Type::AVRO_BOOL:
            return std::make_shared<DataTypeUInt8>();
        case avro::Type::AVRO_FLOAT:
            return std::make_shared<DataTypeFloat32>();
        case avro::Type::AVRO_DOUBLE:
            return std::make_shared<DataTypeFloat64>();
        case avro::Type::AVRO_STRING:
            return std::make_shared<DataTypeString>();
        case avro::Type::AVRO_BYTES:
            /// Fix: Avro "bytes" is an arbitrary byte sequence, so it maps to
            /// String (it was wrongly mapped to Float32 before).
            return std::make_shared<DataTypeString>();
        case avro::Type::AVRO_ENUM:
        {
            /// Avro enum symbols are numbered 0..N-1; pick the smallest
            /// ClickHouse Enum type whose value range fits N.
            if (node->names() < 128)
            {
                EnumValues<Int8>::Values values;
                for (size_t i = 0; i != node->names(); ++i)
                    values.emplace_back(node->nameAt(i), i);
                return std::make_shared<DataTypeEnum8>(std::move(values));
            }
            else if (node->names() < 32768)
            {
                EnumValues<Int16>::Values values;
                for (size_t i = 0; i != node->names(); ++i)
                    values.emplace_back(node->nameAt(i), i);
                return std::make_shared<DataTypeEnum16>(std::move(values));
            }
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse supports only 8 and 16-bit Enum.");
        }
        case avro::Type::AVRO_FIXED:
            return std::make_shared<DataTypeFixedString>(node->fixedSize());
        case avro::Type::AVRO_ARRAY:
            return std::make_shared<DataTypeArray>(avroNodeToDataType(node->leafAt(0)));
        case avro::Type::AVRO_NULL:
            return std::make_shared<DataTypeNothing>();
        case avro::Type::AVRO_UNION:
            /// Only unions of the form [null, T] / [T, null] are supported;
            /// they map to Nullable(T).
            if (node->leaves() == 2 && (node->leafAt(0)->type() == avro::Type::AVRO_NULL || node->leafAt(1)->type() == avro::Type::AVRO_NULL))
            {
                size_t nested_leaf_index = node->leafAt(0)->type() == avro::Type::AVRO_NULL ? 1 : 0;
                return makeNullable(avroNodeToDataType(node->leafAt(nested_leaf_index)));
            }
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type UNION is not supported for inserting.");
        case avro::Type::AVRO_SYMBOLIC:
            /// Symbolic nodes reference previously defined named types; resolve and recurse.
            return avroNodeToDataType(avro::resolveSymbol(node));
        default:
            /// Fix: the original format string had a {} placeholder with no
            /// argument; pass the Avro type name.
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type {} is not supported for inserting.", avro::toString(node->type()));
    }
}
void registerInputFormatAvro(FormatFactory & factory)
{
factory.registerInputFormat("Avro", [](
@ -836,6 +922,21 @@ void registerInputFormatAvro(FormatFactory & factory)
});
}
/// Registers schema readers for plain Avro files and Confluent-framed Avro.
void registerAvroSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader(
        "Avro",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared<AvroSchemaReader>(buf, /* confluent_= */ false, settings); });

    factory.registerSchemaReader(
        "AvroConfluent",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared<AvroSchemaReader>(buf, /* confluent_= */ true, settings); });
}
}
#else
@ -846,6 +947,8 @@ class FormatFactory;
/// Stubs used when ClickHouse is built without Avro support.
void registerInputFormatAvro(FormatFactory &)
{
}

void registerAvroSchemaReader(FormatFactory &) {}
}
#endif

View File

@ -13,6 +13,7 @@
#include <Formats/FormatSettings.h>
#include <Formats/FormatSchemaInfo.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <avro/DataFile.hh>
#include <avro/Decoder.hh>
@ -160,6 +161,20 @@ private:
FormatSettings format_settings;
};
/// Schema reader for the Avro and AvroConfluent formats: the schema comes
/// from the Avro file header or the Confluent schema registry, respectively.
class AvroSchemaReader : public ISchemaReader
{
public:
    AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_);

    NamesAndTypesList readSchema() override;

private:
    /// Converts one Avro schema node into the matching ClickHouse type.
    DataTypePtr avroNodeToDataType(avro::NodePtr node);

    bool confluent;   /// true => AvroConfluent (schema-registry framed) format.
    const FormatSettings format_settings;
};
}
#endif

View File

@ -5,7 +5,6 @@
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/DataTypeFactory.h>
namespace DB
{
@ -15,11 +14,23 @@ namespace ErrorCodes
}
BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_)
: RowInputFormatWithNamesAndTypes(
std::move(header),
in_,
std::move(params_),
with_names_,
with_types_,
format_settings_,
std::make_unique<BinaryFormatReader>(in_, format_settings_))
{
}
std::vector<String> BinaryRowInputFormat::readHeaderRow()
/// Thin reader over the RowBinary wire format; all state lives in the base class
/// plus the read_columns counter filled in by readNames()/skipNames().
BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
{
}
std::vector<String> BinaryFormatReader::readHeaderRow()
{
std::vector<String> fields;
String field;
@ -31,13 +42,13 @@ std::vector<String> BinaryRowInputFormat::readHeaderRow()
return fields;
}
std::vector<String> BinaryRowInputFormat::readNames()
/// The names header starts with a varint column count followed by the names;
/// the count is remembered in read_columns for the subsequent header rows.
std::vector<String> BinaryFormatReader::readNames()
{
    readVarUInt(read_columns, *in);
    return readHeaderRow();
}
std::vector<String> BinaryRowInputFormat::readTypes()
std::vector<String> BinaryFormatReader::readTypes()
{
auto types = readHeaderRow();
for (const auto & type_name : types)
@ -45,31 +56,37 @@ std::vector<String> BinaryRowInputFormat::readTypes()
return types;
}
bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
{
serialization->deserializeBinary(column, *in);
return true;
}
void BinaryRowInputFormat::skipHeaderRow()
void BinaryFormatReader::skipHeaderRow()
{
String tmp;
for (size_t i = 0; i < read_columns; ++i)
readStringBinary(tmp, *in);
}
void BinaryRowInputFormat::skipNames()
/// Same wire layout as readNames(): varint count then the names, but the
/// names themselves are discarded (the count is still stored in read_columns).
void BinaryFormatReader::skipNames()
{
    readVarUInt(read_columns, *in);
    skipHeaderRow();
}
void BinaryRowInputFormat::skipTypes()
/// Skips the types header row; reads the column count first if no names
/// header preceded it.
void BinaryFormatReader::skipTypes()
{
    if (read_columns == 0)
    {
        /// It's possible only when with_names = false and with_types = true
        readVarUInt(read_columns, *in);
    }

    skipHeaderRow();
}
void BinaryRowInputFormat::skipField(size_t file_column)
void BinaryFormatReader::skipField(size_t file_column)
{
if (file_column >= read_data_types.size())
throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because it's type is unknown");
@ -77,6 +94,11 @@ void BinaryRowInputFormat::skipField(size_t file_column)
read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in);
}
/// NOTE(review): &reader is handed to the base class before `reader` itself is
/// constructed; this is valid only if the base merely stores the pointer and
/// does not dereference it during construction -- confirm in the base class.
BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
    : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_)
{
}
void registerInputFormatRowBinary(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
@ -94,4 +116,13 @@ void registerInputFormatRowBinary(FormatFactory & factory)
registerWithNamesAndTypes("RowBinary", register_func);
}
/// Registers the schema reader for RowBinaryWithNamesAndTypes -- the only
/// RowBinary variant that carries names and types in its header.
void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader(
        "RowBinaryWithNamesAndTypes",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared<BinaryWithNamesAndTypesSchemaReader>(buf, settings); });
}
}

View File

@ -1,15 +1,19 @@
#pragma once
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
namespace DB
{
class ReadBuffer;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class ReadBuffer;
/** A stream for inputting data in a binary line-by-line format.
*/
@ -24,9 +28,15 @@ public:
/// in this format we cannot provide any DiagnosticInfo, because here we have
/// just binary data.
std::string getDiagnosticInfo() override { return {}; }
};
class BinaryFormatReader : public FormatWithNamesAndTypesReader
{
public:
BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
void skipField(size_t file_column) override;
void skipNames() override;
@ -37,9 +47,24 @@ private:
std::vector<String> readTypes() override;
std::vector<String> readHeaderRow();
private:
/// Data types read from input data.
DataTypes read_data_types;
UInt64 read_columns = 0;
UInt64 read_columns;
};
/// Schema reader for RowBinaryWithNamesAndTypes: the schema is taken entirely
/// from the names/types header, so no data rows are ever inspected and
/// readRowAndGetDataTypes() is intentionally left unimplemented.
class BinaryWithNamesAndTypesSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
    BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

private:
    DataTypes readRowAndGetDataTypes() override
    {
        throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypes is not implemented"};
    }

    BinaryFormatReader reader;   /// Parses the names/types header for the base class.
};
}

View File

@ -5,13 +5,16 @@
#include <Formats/verbosePrintString.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
@ -26,7 +29,14 @@ CSVRowInputFormat::CSVRowInputFormat(
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_)
: RowInputFormatWithNamesAndTypes(
header_,
in_,
params_,
with_names_,
with_types_,
format_settings_,
std::make_unique<CSVFormatReader>(in_, format_settings_))
{
const String bad_delimiters = " \t\"'.UL";
if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos)
@ -36,6 +46,11 @@ CSVRowInputFormat::CSVRowInputFormat(
ErrorCodes::BAD_ARGUMENTS);
}
/// After a parsing error, resynchronize by skipping to the start of the next line.
void CSVRowInputFormat::syncAfterError()
{
    skipToNextLineOrEOF(*in);
}
static void skipEndOfLine(ReadBuffer & in)
{
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
@ -52,8 +67,10 @@ static void skipEndOfLine(ReadBuffer & in)
if (!in.eof() && *in.position() == '\n')
++in.position();
else
throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA);
throw Exception(
"Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)."
" Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.",
ErrorCodes::INCORRECT_DATA);
}
else if (!in.eof())
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
@ -62,32 +79,38 @@ static void skipEndOfLine(ReadBuffer & in)
/// Skip `whitespace` symbols allowed in CSV.
static inline void skipWhitespacesAndTabs(ReadBuffer & in)
{
while (!in.eof()
&& (*in.position() == ' '
|| *in.position() == '\t'))
while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t'))
++in.position();
}
void CSVRowInputFormat::skipFieldDelimiter()
/// Field-level reader for the CSV family of formats; delimiter and quoting
/// behavior come from format_settings_.csv.
CSVFormatReader::CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
{
}
/// Consumes the configured CSV delimiter (after any leading spaces/tabs);
/// assertChar throws if the next character is not the delimiter.
void CSVFormatReader::skipFieldDelimiter()
{
    skipWhitespacesAndTabs(*in);
    assertChar(format_settings.csv.delimiter, *in);
}
String CSVRowInputFormat::readFieldIntoString()
/// Reads one CSV field into a String, after skipping leading spaces/tabs.
/// read_string selects readCSVString vs readCSVField (presumably the parsed
/// value vs the raw field text -- confirm against ReadHelpers).
template <bool read_string>
String CSVFormatReader::readCSVFieldIntoString()
{
    skipWhitespacesAndTabs(*in);
    String field;
    /// Fix: removed a stray unconditional readCSVString() call that consumed
    /// the field a second time before the branch below.
    if constexpr (read_string)
        readCSVString(field, *in, format_settings.csv);
    else
        readCSVField(field, *in, format_settings.csv);
    return field;
}
void CSVRowInputFormat::skipField()
void CSVFormatReader::skipField()
{
readFieldIntoString();
readCSVFieldIntoString<true>();
}
void CSVRowInputFormat::skipRowEndDelimiter()
void CSVFormatReader::skipRowEndDelimiter()
{
skipWhitespacesAndTabs(*in);
@ -105,33 +128,32 @@ void CSVRowInputFormat::skipRowEndDelimiter()
skipEndOfLine(*in);
}
void CSVRowInputFormat::skipHeaderRow()
void CSVFormatReader::skipHeaderRow()
{
do
{
skipField();
skipWhitespacesAndTabs(*in);
}
while (checkChar(format_settings.csv.delimiter, *in));
} while (checkChar(format_settings.csv.delimiter, *in));
skipRowEndDelimiter();
}
std::vector<String> CSVRowInputFormat::readHeaderRow()
template <bool is_header>
std::vector<String> CSVFormatReader::readRowImpl()
{
std::vector<String> fields;
do
{
fields.push_back(readFieldIntoString());
fields.push_back(readCSVFieldIntoString<is_header>());
skipWhitespacesAndTabs(*in);
}
while (checkChar(format_settings.csv.delimiter, *in));
} while (checkChar(format_settings.csv.delimiter, *in));
skipRowEndDelimiter();
return fields;
}
bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
const char delimiter = format_settings.csv.delimiter;
@ -144,7 +166,8 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
if (*in->position() == '\n' || *in->position() == '\r')
{
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
out << "ERROR: Line feed found where delimiter (" << delimiter
<< ") is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
}
@ -160,7 +183,7 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
return true;
}
bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespacesAndTabs(*in);
@ -191,23 +214,21 @@ bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
return true;
}
void CSVRowInputFormat::syncAfterError()
{
skipToNextLineOrEOF(*in);
}
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/)
bool CSVFormatReader::readField(
IColumn & column,
const DataTypePtr & type,
const SerializationPtr & serialization,
bool is_last_file_column,
const String & /*column_name*/)
{
skipWhitespacesAndTabs(*in);
const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter;
const bool at_last_column_line_end = is_last_file_column
&& (in->eof() || *in->position() == '\n' || *in->position() == '\r');
const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r');
/// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default
/// only one empty or NULL column will be expected
if (format_settings.csv.empty_as_default
&& (at_delimiter || at_last_column_line_end))
if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end))
{
/// Treat empty unquoted column value as default value, if
/// specified in the settings. Tuple columns might seem
@ -231,6 +252,31 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co
}
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_setting_.max_rows_to_read_for_schema_inference,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
, reader(in_, format_setting_)
, context(context_)
{
}
DataTypes CSVSchemaReader::readRowAndGetDataTypes()
{
if (in.eof())
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context);
}
void registerInputFormatCSV(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
@ -326,4 +372,17 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory)
registerWithNamesAndTypes("CSV", register_func);
}
void registerCSVSchemaReader(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
{
return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings, context);
});
};
registerWithNamesAndTypes("CSV", register_func);
}
}

View File

@ -5,6 +5,7 @@
#include <Core/Block.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
@ -28,6 +29,12 @@ public:
private:
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
};
class CSVFormatReader : public FormatWithNamesAndTypesReader
{
public:
CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_);
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
@ -42,17 +49,34 @@ private:
void skipField(size_t /*file_column*/) override { skipField(); }
void skipField();
void skipHeaderRow() ;
void skipHeaderRow();
void skipNames() override { skipHeaderRow(); }
void skipTypes() override { skipHeaderRow(); }
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
std::vector<String> readHeaderRow() { return readRowImpl<true>(); }
std::vector<String> readRow() { return readRowImpl<false>(); }
String readFieldIntoString();
template <bool is_header>
std::vector<String> readRowImpl();
template <bool read_string>
String readCSVFieldIntoString();
};
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_);
private:
DataTypes readRowAndGetDataTypes() override;
CSVFormatReader reader;
ContextPtr context;
};
}

View File

@ -273,6 +273,7 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
#endif
auto root_reader = msg.getRoot<capnp::DynamicStruct>(root);
for (size_t i = 0; i != columns.size(); ++i)
{
auto value = getReaderByColumnName(root_reader, column_names[i]);
@ -282,6 +283,24 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
return true;
}
CapnProtoSchemaReader::CapnProtoSchemaReader(const FormatSettings & format_settings_) : format_settings(format_settings_)
{
}
NamesAndTypesList CapnProtoSchemaReader::readSchema()
{
auto schema_info = FormatSchemaInfo(
format_settings.schema.format_schema,
"CapnProto",
true,
format_settings.schema.is_server,
format_settings.schema.format_schema_path);
auto schema_parser = CapnProtoSchemaParser();
auto schema = schema_parser.getMessageSchema(schema_info);
return capnProtoSchemaToCHSchema(schema);
}
void registerInputFormatCapnProto(FormatFactory & factory)
{
factory.registerInputFormat(
@ -293,6 +312,14 @@ void registerInputFormatCapnProto(FormatFactory & factory)
});
}
void registerCapnProtoSchemaReader(FormatFactory & factory)
{
factory.registerExternalSchemaReader("CapnProto", [](const FormatSettings & settings)
{
return std::make_shared<CapnProtoSchemaReader>(settings);
});
}
}
#else
@ -301,6 +328,7 @@ namespace DB
{
class FormatFactory;
void registerInputFormatCapnProto(FormatFactory &) {}
void registerCapnProtoSchemaReader(FormatFactory &) {}
}
#endif // USE_CAPNP

View File

@ -6,6 +6,7 @@
#include <Core/Block.h>
#include <Formats/CapnProtoUtils.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
namespace DB
{
@ -38,6 +39,17 @@ private:
Names column_names;
};
class CapnProtoSchemaReader : public IExternalSchemaReader
{
public:
explicit CapnProtoSchemaReader(const FormatSettings & format_settings_);
NamesAndTypesList readSchema() override;
private:
const FormatSettings format_settings;
};
}
#endif // USE_CAPNP

View File

@ -31,7 +31,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
bool ignore_spaces_,
const FormatSettings & format_settings_)
: CustomSeparatedRowInputFormat(
header_, std::make_unique<PeekableReadBuffer>(in_buf_), params_, with_names_, with_types_, ignore_spaces_, format_settings_)
header_, std::make_unique<PeekableReadBuffer>(in_buf_), params_, with_names_, with_types_, ignore_spaces_, updateFormatSettings(format_settings_))
{
}
@ -43,10 +43,15 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
bool with_types_,
bool ignore_spaces_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, *buf_, params_, with_names_, with_types_, updateFormatSettings(format_settings_))
: RowInputFormatWithNamesAndTypes(
header_,
*buf_,
params_,
with_names_,
with_types_,
format_settings_,
std::make_unique<CustomSeparatedFormatReader>(*buf_, ignore_spaces_, format_settings_))
, buf(std::move(buf_))
, ignore_spaces(ignore_spaces_)
, escaping_rule(format_settings_.custom.escaping_rule)
{
/// In case of CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know
/// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are
@ -61,43 +66,76 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
}
}
void CustomSeparatedRowInputFormat::skipPrefixBeforeHeader()
bool CustomSeparatedRowInputFormat::allowSyncAfterError() const
{
return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty();
}
void CustomSeparatedRowInputFormat::syncAfterError()
{
skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces);
end_of_stream = buf->eof();
/// It can happen that buf->position() is not at the beginning of row
/// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter.
/// It will cause another parsing error.
}
void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_)
{
buf = std::make_unique<PeekableReadBuffer>(in_);
RowInputFormatWithNamesAndTypes::setReadBuffer(*buf);
}
CustomSeparatedFormatReader::CustomSeparatedFormatReader(
PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesReader(buf_, format_settings_), buf(&buf_), ignore_spaces(ignore_spaces_)
{
}
void CustomSeparatedRowInputFormat::resetParser()
{
RowInputFormatWithNamesAndTypes::resetParser();
buf->reset();
}
void CustomSeparatedFormatReader::skipPrefixBeforeHeader()
{
skipSpaces();
assertString(format_settings.custom.result_before_delimiter, *buf);
}
void CustomSeparatedRowInputFormat::skipRowStartDelimiter()
void CustomSeparatedFormatReader::skipRowStartDelimiter()
{
skipSpaces();
assertString(format_settings.custom.row_before_delimiter, *buf);
}
void CustomSeparatedRowInputFormat::skipFieldDelimiter()
void CustomSeparatedFormatReader::skipFieldDelimiter()
{
skipSpaces();
assertString(format_settings.custom.field_delimiter, *buf);
}
void CustomSeparatedRowInputFormat::skipRowEndDelimiter()
void CustomSeparatedFormatReader::skipRowEndDelimiter()
{
skipSpaces();
assertString(format_settings.custom.row_after_delimiter, *buf);
}
void CustomSeparatedRowInputFormat::skipRowBetweenDelimiter()
void CustomSeparatedFormatReader::skipRowBetweenDelimiter()
{
skipSpaces();
assertString(format_settings.custom.row_between_delimiter, *buf);
}
void CustomSeparatedRowInputFormat::skipField()
void CustomSeparatedFormatReader::skipField()
{
skipSpaces();
skipFieldByEscapingRule(*buf, escaping_rule, format_settings);
skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings);
}
bool CustomSeparatedRowInputFormat::checkEndOfRow()
bool CustomSeparatedFormatReader::checkEndOfRow()
{
PeekableReadBufferCheckpoint checkpoint{*buf, true};
@ -118,43 +156,66 @@ bool CustomSeparatedRowInputFormat::checkEndOfRow()
return checkForSuffixImpl(true);
}
std::vector<String> CustomSeparatedRowInputFormat::readHeaderRow()
template <bool is_header>
String CustomSeparatedFormatReader::readFieldIntoString(bool is_first)
{
if (!is_first)
skipFieldDelimiter();
skipSpaces();
if constexpr (is_header)
return readStringByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings);
else
return readFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings);
}
template <bool is_header>
std::vector<String> CustomSeparatedFormatReader::readRowImpl()
{
std::vector<String> values;
skipRowStartDelimiter();
do
if (columns == 0)
{
if (!values.empty())
skipFieldDelimiter();
skipSpaces();
values.push_back(readStringByEscapingRule(*buf, escaping_rule, format_settings));
do
{
values.push_back(readFieldIntoString<is_header>(values.empty()));
} while (!checkEndOfRow());
columns = values.size();
}
else
{
for (size_t i = 0; i != columns; ++i)
values.push_back(readFieldIntoString<is_header>(i == 0));
}
while (!checkEndOfRow());
skipRowEndDelimiter();
return values;
}
void CustomSeparatedRowInputFormat::skipHeaderRow()
void CustomSeparatedFormatReader::skipHeaderRow()
{
size_t columns = getPort().getHeader().columns();
skipRowStartDelimiter();
for (size_t i = 0; i != columns; ++i)
bool first = true;
do
{
skipField();
if (i + 1 != columns)
if (!first)
skipFieldDelimiter();
first = false;
skipField();
}
while (!checkEndOfRow());
skipRowEndDelimiter();
}
bool CustomSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &)
bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &)
{
skipSpaces();
return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, format_settings);
return deserializeFieldByEscapingRule(type, serialization, column, *buf, format_settings.custom.escaping_rule, format_settings);
}
bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof)
bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)
{
skipSpaces();
if (format_settings.custom.result_after_delimiter.empty())
@ -177,7 +238,7 @@ bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof)
return false;
}
bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out)
bool CustomSeparatedFormatReader::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out)
{
PeekableReadBufferCheckpoint checkpoint{*buf};
if (checkForSuffixImpl(false))
@ -192,7 +253,7 @@ bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer
return true;
}
bool CustomSeparatedRowInputFormat::checkForSuffix()
bool CustomSeparatedFormatReader::checkForSuffix()
{
PeekableReadBufferCheckpoint checkpoint{*buf};
if (checkForSuffixImpl(true))
@ -201,51 +262,60 @@ bool CustomSeparatedRowInputFormat::checkForSuffix()
return false;
}
bool CustomSeparatedRowInputFormat::allowSyncAfterError() const
{
return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty();
}
void CustomSeparatedRowInputFormat::syncAfterError()
{
skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces);
end_of_stream = buf->eof();
/// It can happen that buf->position() is not at the beginning of row
/// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter.
/// It will cause another parsing error.
}
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
bool CustomSeparatedFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces);
}
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool CustomSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces);
}
bool CustomSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
bool CustomSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_after_delimiter, "delimiter after last field", ignore_spaces);
}
bool CustomSeparatedRowInputFormat::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool CustomSeparatedFormatReader::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces);
}
void CustomSeparatedRowInputFormat::resetParser()
void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
{
RowInputFormatWithNamesAndTypes::resetParser();
buf->reset();
buf = assert_cast<PeekableReadBuffer *>(&in_);
FormatWithNamesAndTypesReader::setReadBuffer(in_);
}
void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_)
CustomSeparatedSchemaReader::CustomSeparatedSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_)
: FormatWithNamesAndTypesSchemaReader(
buf,
format_setting_.max_rows_to_read_for_schema_inference,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule))
, buf(in_)
, reader(buf, ignore_spaces_, updateFormatSettings(format_setting_))
, context(context_)
{
buf = std::make_unique<PeekableReadBuffer>(in_);
IInputFormat::setReadBuffer(*buf);
}
DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes()
{
if (reader.checkForSuffix())
return {};
if (!first_row || with_names || with_types)
reader.skipRowBetweenDelimiter();
if (first_row)
first_row = false;
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context);
}
void registerInputFormatCustomSeparated(FormatFactory & factory)
@ -267,4 +337,20 @@ void registerInputFormatCustomSeparated(FormatFactory & factory)
}
}
void registerCustomSeparatedSchemaReader(FormatFactory & factory)
{
for (bool ignore_spaces : {false, true})
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
{
return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings, context);
});
};
registerWithNamesAndTypes(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func);
}
}
}

View File

@ -19,7 +19,6 @@ public:
void resetParser() override;
String getName() const override { return "CustomSeparatedRowInputFormat"; }
void setReadBuffer(ReadBuffer & in_) override;
private:
@ -28,6 +27,19 @@ private:
std::unique_ptr<PeekableReadBuffer> in_buf_,
const Params & params_,
bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_);
bool allowSyncAfterError() const override;
void syncAfterError() override;
std::unique_ptr<PeekableReadBuffer> buf;
bool ignore_spaces;
};
class CustomSeparatedFormatReader : public FormatWithNamesAndTypesReader
{
public:
CustomSeparatedFormatReader(PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_);
using EscapingRule = FormatSettings::EscapingRule;
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
@ -46,9 +58,6 @@ private:
bool checkForSuffix() override;
bool allowSyncAfterError() const override;
void syncAfterError() override;
bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
@ -57,15 +66,41 @@ private:
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
std::vector<String> readHeaderRow();
std::vector<String> readHeaderRow() {return readRowImpl<true>(); }
std::vector<String> readRow() { return readRowImpl<false>(); }
bool checkEndOfRow();
bool checkForSuffixImpl(bool check_eof);
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
std::unique_ptr<PeekableReadBuffer> buf;
EscapingRule getEscapingRule() { return format_settings.custom.escaping_rule; }
void setReadBuffer(ReadBuffer & in_) override;
private:
template <bool is_header>
std::vector<String> readRowImpl();
template <bool read_string>
String readFieldIntoString(bool is_first);
PeekableReadBuffer * buf;
bool ignore_spaces;
EscapingRule escaping_rule;
size_t columns = 0;
};
class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_);
private:
DataTypes readRowAndGetDataTypes() override;
PeekableReadBuffer buf;
CustomSeparatedFormatReader reader;
ContextPtr context;
bool first_row = true;
};
}

View File

@ -202,4 +202,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor
factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl);
}
void registerJSONAsStringSchemaReader(FormatFactory & factory)
{
factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &)
{
return std::make_shared<JSONAsStringExternalSchemaReader>();
});
}
}

View File

@ -1,8 +1,10 @@
#pragma once
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatFactory.h>
#include <IO/PeekableReadBuffer.h>
#include <DataTypes/DataTypeString.h>
namespace DB
{
@ -39,4 +41,13 @@ private:
bool allow_new_rows = true;
};
class JSONAsStringExternalSchemaReader : public IExternalSchemaReader
{
public:
NamesAndTypesList readSchema() override
{
return {{"json", std::make_shared<DataTypeString>()}};
}
};
}

View File

@ -1,6 +1,7 @@
#include <Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h>
#include <IO/ReadHelpers.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/Operators.h>
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
@ -8,16 +9,13 @@
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeNullable.h>
#include <Poco/JSON/Parser.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
}
JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
const Block & header_,
ReadBuffer & in_,
@ -26,24 +24,40 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
bool with_types_,
bool yield_strings_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_)
, yield_strings(yield_strings_)
: RowInputFormatWithNamesAndTypes(
header_,
in_,
std::move(params_),
with_names_,
with_types_,
format_settings_,
std::make_unique<JSONCompactEachRowFormatReader>(in_, yield_strings_, format_settings_))
{
}
void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter()
void JSONCompactEachRowRowInputFormat::syncAfterError()
{
skipToUnescapedNextLineOrEOF(*in);
}
JSONCompactEachRowFormatReader::JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesReader(in_, format_settings_), yield_strings(yield_strings_)
{
}
void JSONCompactEachRowFormatReader::skipRowStartDelimiter()
{
skipWhitespaceIfAny(*in);
assertChar('[', *in);
}
void JSONCompactEachRowRowInputFormat::skipFieldDelimiter()
void JSONCompactEachRowFormatReader::skipFieldDelimiter()
{
skipWhitespaceIfAny(*in);
assertChar(',', *in);
}
void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter()
void JSONCompactEachRowFormatReader::skipRowEndDelimiter()
{
skipWhitespaceIfAny(*in);
assertChar(']', *in);
@ -55,29 +69,18 @@ void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter()
skipWhitespaceIfAny(*in);
}
String JSONCompactEachRowRowInputFormat::readFieldIntoString()
void JSONCompactEachRowFormatReader::skipField()
{
skipWhitespaceIfAny(*in);
String field;
readJSONString(field, *in);
return field;
skipJSONField(*in, "skipped_field");
}
void JSONCompactEachRowRowInputFormat::skipField(size_t file_column)
{
skipWhitespaceIfAny(*in);
skipJSONField(*in, column_mapping->names_of_columns[file_column]);
}
void JSONCompactEachRowRowInputFormat::skipHeaderRow()
void JSONCompactEachRowFormatReader::skipHeaderRow()
{
skipRowStartDelimiter();
size_t i = 0;
do
{
if (i >= column_mapping->names_of_columns.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names");
skipField(i++);
skipField();
skipWhitespaceIfAny(*in);
}
while (checkChar(',', *in));
@ -85,13 +88,16 @@ void JSONCompactEachRowRowInputFormat::skipHeaderRow()
skipRowEndDelimiter();
}
std::vector<String> JSONCompactEachRowRowInputFormat::readHeaderRow()
std::vector<String> JSONCompactEachRowFormatReader::readHeaderRow()
{
skipRowStartDelimiter();
std::vector<String> fields;
String field;
do
{
fields.push_back(readFieldIntoString());
skipWhitespaceIfAny(*in);
readJSONString(field, *in);
fields.push_back(field);
skipWhitespaceIfAny(*in);
}
while (checkChar(',', *in));
@ -100,18 +106,13 @@ std::vector<String> JSONCompactEachRowRowInputFormat::readHeaderRow()
return fields;
}
bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name)
bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name)
{
skipWhitespaceIfAny(*in);
return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings);
}
void JSONCompactEachRowRowInputFormat::syncAfterError()
{
skipToUnescapedNextLineOrEOF(*in);
}
bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespaceIfAny(*in);
if (!checkChar('[', *in))
@ -123,7 +124,7 @@ bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuff
return true;
}
bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool JSONCompactEachRowFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
try
{
@ -150,7 +151,7 @@ bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(Wri
return true;
}
bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespaceIfAny(*in);
@ -180,6 +181,20 @@ bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer
return true;
}
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_)
{
}
DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes()
{
skipWhitespaceIfAny(in);
if (in.eof())
return {};
return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings());
}
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
{
for (bool yield_strings : {true, false})
@ -200,6 +215,21 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
}
}
void registerJSONCompactEachRowSchemaReader(FormatFactory & factory)
{
for (bool json_strings : {false, true})
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
{
return std::make_shared<JSONCompactEachRowRowSchemaReader>(buf, with_names, with_types, json_strings, settings);
});
};
registerWithNamesAndTypes(json_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
}
}
void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)

View File

@ -2,6 +2,7 @@
#include <Core/Block.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Common/HashTable/HashMap.h>
@ -10,6 +11,7 @@ namespace DB
class ReadBuffer;
/** A stream for reading data in a bunch of formats:
* - JSONCompactEachRow
* - JSONCompactEachRowWithNamesAndTypes
@ -34,6 +36,13 @@ public:
private:
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
};
class JSONCompactEachRowFormatReader : public FormatWithNamesAndTypesReader
{
public:
JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_);
bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
@ -45,7 +54,8 @@ private:
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
void skipField(size_t file_column) override;
void skipField(size_t /*column_index*/) override { skipField(); }
void skipField();
void skipHeaderRow();
void skipNames() override { skipHeaderRow(); }
void skipTypes() override { skipHeaderRow(); }
@ -56,9 +66,21 @@ private:
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
String readFieldIntoString();
bool yieldStrings() const { return yield_strings; }
private:
bool yield_strings;
};
class JSONCompactEachRowRowSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_);
private:
DataTypes readRowAndGetDataTypes() override;
JSONCompactEachRowFormatReader reader;
};
}

View File

@ -6,6 +6,7 @@
#include <Formats/FormatFactory.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/getLeastSupertype.h>
namespace DB
{
@ -286,11 +287,7 @@ void JSONEachRowRowInputFormat::readPrefix()
skipBOMIfExists(*in);
skipWhitespaceIfAny(*in);
if (!in->eof() && *in->position() == '[')
{
++in->position();
data_in_square_brackets = true;
}
data_in_square_brackets = checkChar('[', *in);
}
void JSONEachRowRowInputFormat::readSuffix()
@ -309,6 +306,28 @@ void JSONEachRowRowInputFormat::readSuffix()
assertEOF(*in);
}
JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings)
: IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_)
{
}
std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes()
{
if (first_row)
{
skipBOMIfExists(in);
skipWhitespaceIfAny(in);
checkChar('[', in);
first_row = false;
}
skipWhitespaceIfAny(in);
if (in.eof())
return {};
return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings);
}
void registerInputFormatJSONEachRow(FormatFactory & factory)
{
@ -343,4 +362,17 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory
factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl);
}
/// Registers schema readers for JSONEachRow and JSONStringsEachRow.
/// Both formats share JSONEachRowSchemaReader; they differ only in whether
/// field values are encoded as JSON strings.
void registerJSONEachRowSchemaReader(FormatFactory & factory)
{
    auto register_one = [&factory](const String & format_name, bool json_strings)
    {
        factory.registerSchemaReader(format_name, [json_strings](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
        {
            return std::make_unique<JSONEachRowSchemaReader>(buf, json_strings, settings);
        });
    };

    register_one("JSONEachRow", false);
    register_one("JSONStringsEachRow", true);
}
}

View File

@ -2,6 +2,7 @@
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Common/HashTable/HashMap.h>
@ -84,4 +85,16 @@ private:
bool yield_strings;
};
/// Schema reader for JSONEachRow/JSONStringsEachRow: infers column names and
/// types from the keys and values of each row's JSON object.
class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader
{
public:
    JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings);

private:
    /// Reads one JSON object, returns inferred {name -> type}; empty map on EOF.
    std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;

    /// True for JSONStringsEachRow: values are encoded as JSON strings.
    bool json_strings;
    /// Used to skip BOM and an optional '[' on the first row only.
    bool first_row = true;
};
}

View File

@ -72,4 +72,13 @@ void registerInputFormatLineAsString(FormatFactory & factory)
return std::make_shared<LineAsStringRowInputFormat>(sample, buf, params);
});
}
/// Registers the external schema reader for LineAsString.
/// The schema is fixed (a single String column), so no data inspection
/// is needed and the settings argument is ignored.
/// NOTE(review): the reader class name is spelled "LinaAsString…" in the
/// header — the misspelling is kept here to match its declaration.
void registerLineAsStringSchemaReader(FormatFactory & factory)
{
    factory.registerExternalSchemaReader("LineAsString", [](const FormatSettings &)
    {
        return std::make_shared<LinaAsStringSchemaReader>();
    });
}
}

View File

@ -1,7 +1,9 @@
#pragma once
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatFactory.h>
#include <DataTypes/DataTypeString.h>
namespace DB
{
@ -26,4 +28,13 @@ private:
void readLineObject(IColumn & column);
};
/// External schema reader for the LineAsString format: the schema is always
/// a single String column named "line", so no data needs to be read.
/// NOTE(review): class name is misspelled ("Lina" instead of "Line"); renaming
/// requires updating the registration site as well.
class LinaAsStringSchemaReader : public IExternalSchemaReader
{
public:
    NamesAndTypesList readSchema() override
    {
        return {{"line", std::make_shared<DataTypeString>()}};
    }
};
}

View File

@ -11,6 +11,7 @@
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
@ -26,6 +27,8 @@ namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int INCORRECT_DATA;
extern const int BAD_ARGUMENTS;
extern const int UNEXPECTED_END_OF_FILE;
}
MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_)
@ -369,7 +372,108 @@ bool MsgPackRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &
/// Replace the underlying read buffer. Parsing always goes through a
/// PeekableReadBuffer wrapper, which is re-created around the new buffer;
/// the base class is pointed at the plain buffer.
void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_)
{
    buf = std::make_unique<PeekableReadBuffer>(in_);
    /// The rendered diff left two setReadBuffer calls stacked here
    /// (first with *buf, then with in_); only the final one is kept.
    IInputFormat::setReadBuffer(in_);
}
/// MsgPack carries no column names/count, so the number of columns must be
/// supplied via the input_format_msgpack_number_of_columns setting.
/// Note: the base class is initialized with a reference to the member `buf`
/// before `buf` itself is constructed — valid as long as the base only stores
/// the reference (it must not read from it during construction).
MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
    : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns)
{
    if (!number_of_columns)
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data");
}
/// Reads one complete msgpack object from the buffer.
/// The object may span multiple buffer refills, so a checkpoint is used to
/// accumulate its bytes into continuous memory until msgpack::unpack succeeds.
/// Throws UNEXPECTED_END_OF_FILE if the input ends mid-object.
msgpack::object_handle MsgPackSchemaReader::readObject()
{
    if (buf.eof())
        throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected eof while parsing msgpack object");

    PeekableReadBufferCheckpoint checkpoint{buf};
    size_t offset = 0;
    bool need_more_data = true;
    msgpack::object_handle object_handle;
    while (need_more_data)
    {
        offset = 0;
        try
        {
            /// Try to parse an object from the bytes currently available.
            object_handle = msgpack::unpack(buf.position(), buf.buffer().end() - buf.position(), offset);
            need_more_data = false;
        }
        catch (msgpack::insufficient_bytes &)
        {
            /// Not enough bytes: advance to the buffer end so eof() pulls
            /// the next chunk, then make everything since the checkpoint
            /// continuous and retry parsing from the object's start.
            buf.position() = buf.buffer().end();
            if (buf.eof())
                throw Exception("Unexpected end of file while parsing msgpack object", ErrorCodes::UNEXPECTED_END_OF_FILE);
            buf.position() = buf.buffer().end();
            buf.makeContinuousMemoryFromCheckpointToPos();
            buf.rollbackToCheckpoint();
        }
    }
    /// Consume exactly the bytes of the parsed object.
    buf.position() += offset;
    return object_handle;
}
/// Maps a msgpack object to a ClickHouse data type.
/// Scalars are wrapped in Nullable; returns nullptr when the type cannot be
/// determined (NIL, empty arrays/maps, or an undetermined nested type).
/// Throws BAD_ARGUMENTS for unsupported msgpack types (e.g. EXT).
DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object)
{
    switch (object.type)
    {
        case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]];
        case msgpack::type::object_type::NEGATIVE_INTEGER:
            return makeNullable(std::make_shared<DataTypeInt64>());
        case msgpack::type::object_type::FLOAT32:
            return makeNullable(std::make_shared<DataTypeFloat32>());
        case msgpack::type::object_type::FLOAT64:
            return makeNullable(std::make_shared<DataTypeFloat64>());
        case msgpack::type::object_type::BOOLEAN:
            return makeNullable(std::make_shared<DataTypeUInt8>());
        case msgpack::type::object_type::BIN: [[fallthrough]];
        case msgpack::type::object_type::STR:
            return makeNullable(std::make_shared<DataTypeString>());
        case msgpack::type::object_type::ARRAY:
        {
            msgpack::object_array object_array = object.via.array;
            if (object_array.size)
            {
                /// Element type is inferred from the first element only.
                /// Reuse the computed type instead of recursing a second time.
                if (auto nested_type = getDataType(object_array.ptr[0]))
                    return std::make_shared<DataTypeArray>(nested_type);
            }
            return nullptr;
        }
        case msgpack::type::object_type::MAP:
        {
            msgpack::object_map object_map = object.via.map;
            if (object_map.size)
            {
                /// Map keys must not be Nullable in ClickHouse.
                auto key_type = removeNullable(getDataType(object_map.ptr[0].key));
                auto value_type = getDataType(object_map.ptr[0].val);
                if (key_type && value_type)
                    return std::make_shared<DataTypeMap>(key_type, value_type);
            }
            return nullptr;
        }
        case msgpack::type::object_type::NIL:
            return nullptr;
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported");
    }
}
/// Reads one logical row (number_of_columns msgpack objects) and returns the
/// inferred type of each value. Returns an empty list at end of input.
DataTypes MsgPackSchemaReader::readRowAndGetDataTypes()
{
    if (buf.eof())
        return {};

    DataTypes types;
    types.reserve(number_of_columns);
    for (size_t column = 0; column < number_of_columns; ++column)
    {
        /// The object handle stays alive for the duration of the call.
        auto handle = readObject();
        types.push_back(getDataType(handle.get()));
    }
    return types;
}
void registerInputFormatMsgPack(FormatFactory & factory)
@ -384,6 +488,14 @@ void registerInputFormatMsgPack(FormatFactory & factory)
});
}
/// Registers the MsgPack schema reader in the format factory.
void registerMsgPackSchemaReader(FormatFactory & factory)
{
    auto creator = [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
    {
        return std::make_shared<MsgPackSchemaReader>(buf, settings);
    };
    factory.registerSchemaReader("MsgPack", creator);
}
}
#else
@ -394,6 +506,10 @@ class FormatFactory;
/// No-op stubs used when ClickHouse is built without msgpack support
/// (USE_MSGPACK is off): registration becomes a no-op.
void registerInputFormatMsgPack(FormatFactory &)
{
}

void registerMsgPackSchemaReader(FormatFactory &)
{
}
}
#endif

View File

@ -6,6 +6,7 @@
#if USE_MSGPACK
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatFactory.h>
#include <IO/PeekableReadBuffer.h>
#include <msgpack.hpp>
@ -76,6 +77,20 @@ private:
const DataTypes data_types;
};
/// Schema reader for MsgPack. The format is schemaless, so the column count
/// comes from a setting and types are inferred from the first rows' values.
class MsgPackSchemaReader : public IRowSchemaReader
{
public:
    MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

private:
    /// Reads one complete msgpack object (refilling the buffer as needed).
    msgpack::object_handle readObject();
    /// Maps a msgpack object to a ClickHouse type (nullptr if undetermined).
    DataTypePtr getDataType(const msgpack::object & object);
    /// Reads number_of_columns objects and returns their inferred types.
    DataTypes readRowAndGetDataTypes() override;

    PeekableReadBuffer buf;
    /// Taken from input_format_msgpack_number_of_columns; must be non-zero.
    UInt64 number_of_columns;
};
}
#endif

View File

@ -1,8 +1,10 @@
#include <Formats/NativeReader.h>
#include <Formats/NativeWriter.h>
#include <Formats/FormatFactory.h>
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/IOutputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Processors/Transforms/AggregatingTransform.h>
@ -82,6 +84,20 @@ private:
NativeWriter writer;
};
/// Schema reader for the Native format: the schema is read directly from the
/// header of the first block, no type inference needed.
class NativeSchemaReader : public ISchemaReader
{
public:
    explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {}

    NamesAndTypesList readSchema() override
    {
        /// Second argument is presumably the server revision — 0 here;
        /// TODO confirm against NativeReader's constructor.
        auto reader = NativeReader(in, 0);
        auto block = reader.read();
        return block.getNamesAndTypesList();
    }
};
void registerInputFormatNative(FormatFactory & factory)
{
factory.registerInputFormat("Native", [](
@ -106,4 +122,14 @@ void registerOutputFormatNative(FormatFactory & factory)
});
}
/// Registers the Native schema reader; settings and context are not needed
/// because the Native format is self-describing.
void registerNativeSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings & /*settings*/, ContextPtr /*context*/)
    {
        return std::make_shared<NativeSchemaReader>(buf);
    });
}
}

View File

@ -87,9 +87,14 @@ static size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
return 1;
}
void ORCBlockInputFormat::prepareReader()
static void getFileReaderAndSchema(
ReadBuffer & in,
std::unique_ptr<arrow::adapters::orc::ORCFileReader> & file_reader,
std::shared_ptr<arrow::Schema> & schema,
const FormatSettings & format_settings,
std::atomic<int> & is_stopped)
{
auto arrow_file = asArrowFile(*in, format_settings, is_stopped);
auto arrow_file = asArrowFile(in, format_settings, is_stopped);
if (is_stopped)
return;
@ -101,7 +106,15 @@ void ORCBlockInputFormat::prepareReader()
auto read_schema_result = file_reader->ReadSchema();
if (!read_schema_result.ok())
throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS);
std::shared_ptr<arrow::Schema> schema = std::move(read_schema_result).ValueOrDie();
schema = std::move(read_schema_result).ValueOrDie();
}
void ORCBlockInputFormat::prepareReader()
{
std::shared_ptr<arrow::Schema> schema;
getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped);
if (is_stopped)
return;
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "ORC", format_settings.orc.import_nested);
@ -128,7 +141,21 @@ void ORCBlockInputFormat::prepareReader()
}
}
void registerInputFormatORC(FormatFactory &factory)
/// Stores the format settings; the ORC file itself is opened lazily
/// in readSchema().
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
{
}
/// Opens the ORC file, reads its Arrow schema and converts it to a
/// ClickHouse header, returning the resulting names and types.
NamesAndTypesList ORCSchemaReader::readSchema()
{
    std::atomic<int> stop_flag = 0;
    std::unique_ptr<arrow::adapters::orc::ORCFileReader> reader;
    std::shared_ptr<arrow::Schema> arrow_schema;
    getFileReaderAndSchema(in, reader, arrow_schema, format_settings, stop_flag);
    return ArrowColumnToCHColumn::arrowSchemaToCHHeader(*arrow_schema, "ORC").getNamesAndTypesList();
}
void registerInputFormatORC(FormatFactory & factory)
{
factory.registerInputFormat(
"ORC",
@ -142,6 +169,17 @@ void registerInputFormatORC(FormatFactory &factory)
factory.markFormatAsColumnOriented("ORC");
}
/// Registers the ORC schema reader in the format factory.
void registerORCSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader("ORC", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
    {
        return std::make_shared<ORCSchemaReader>(buf, settings);
    });
}
}
#else
@ -151,6 +189,10 @@ namespace DB
/// No-op stubs used when ClickHouse is built without ORC support
/// (USE_ORC is off): registration becomes a no-op.
void registerInputFormatORC(FormatFactory &)
{
}

void registerORCSchemaReader(FormatFactory &)
{
}
}
#endif

View File

@ -3,6 +3,7 @@
#if USE_ORC
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <arrow/adapters/orc/adapter.h>
@ -54,5 +55,16 @@ private:
std::atomic<int> is_stopped{0};
};
/// Schema reader for ORC: reads the Arrow schema embedded in the file and
/// converts it to a ClickHouse header.
class ORCSchemaReader : public ISchemaReader
{
public:
    ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

    NamesAndTypesList readSchema() override;

private:
    const FormatSettings format_settings;
};
}
#endif

View File

@ -87,6 +87,7 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
{
return orc::createPrimitiveType(orc::TypeKind::DOUBLE);
}
case TypeIndex::Date32: [[fallthrough]];
case TypeIndex::Date:
{
return orc::createPrimitiveType(orc::TypeKind::DATE);
@ -292,6 +293,7 @@ void ORCBlockOutputFormat::writeColumn(
writeNumbers<UInt16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; });
break;
}
case TypeIndex::Date32: [[fallthrough]];
case TypeIndex::Int32:
{
writeNumbers<Int32, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int32 & value){ return value; });

View File

@ -94,19 +94,30 @@ static size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
return 1;
}
/// Opens a Parquet file over the read buffer and fetches its Arrow schema.
/// Outputs are returned through file_reader and schema.
/// If is_stopped becomes set while the file is being materialized, the
/// function returns early and both outputs are left unset — callers must
/// check is_stopped before using them.
static void getFileReaderAndSchema(
    ReadBuffer & in,
    std::unique_ptr<parquet::arrow::FileReader> & file_reader,
    std::shared_ptr<arrow::Schema> & schema,
    const FormatSettings & format_settings,
    std::atomic<int> & is_stopped)
{
    auto arrow_file = asArrowFile(in, format_settings, is_stopped);
    if (is_stopped)
        return;
    THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader));
    THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema));
}
void ParquetBlockInputFormat::prepareReader()
{
auto arrow_file = asArrowFile(*in, format_settings, is_stopped);
std::shared_ptr<arrow::Schema> schema;
getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped);
if (is_stopped)
return;
THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader));
row_group_total = file_reader->num_row_groups();
row_group_current = 0;
std::shared_ptr<arrow::Schema> schema;
THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema));
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested);
std::unordered_set<String> nested_table_names;
@ -130,7 +141,21 @@ void ParquetBlockInputFormat::prepareReader()
}
}
void registerInputFormatParquet(FormatFactory &factory)
/// Stores the format settings; the Parquet file itself is opened lazily
/// in readSchema().
ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_)
{
}
/// Opens the Parquet file, reads its Arrow schema and converts it to a
/// ClickHouse header, returning the resulting names and types.
NamesAndTypesList ParquetSchemaReader::readSchema()
{
    std::atomic<int> stop_flag = 0;
    std::unique_ptr<parquet::arrow::FileReader> reader;
    std::shared_ptr<arrow::Schema> arrow_schema;
    getFileReaderAndSchema(in, reader, arrow_schema, format_settings, stop_flag);
    return ArrowColumnToCHColumn::arrowSchemaToCHHeader(*arrow_schema, "Parquet").getNamesAndTypesList();
}
void registerInputFormatParquet(FormatFactory & factory)
{
factory.registerInputFormat(
"Parquet",
@ -144,6 +169,17 @@ void registerInputFormatParquet(FormatFactory &factory)
factory.markFormatAsColumnOriented("Parquet");
}
/// Registers the Parquet schema reader in the format factory.
void registerParquetSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader("Parquet", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
    {
        return std::make_shared<ParquetSchemaReader>(buf, settings);
    });
}
}
#else
@ -154,6 +190,8 @@ class FormatFactory;
/// No-op stubs used when ClickHouse is built without Parquet support
/// (USE_PARQUET is off): registration becomes a no-op.
void registerInputFormatParquet(FormatFactory &)
{
}

void registerParquetSchemaReader(FormatFactory &) {}
}
#endif

View File

@ -3,6 +3,7 @@
#if USE_PARQUET
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
namespace parquet::arrow { class FileReader; }
@ -44,6 +45,17 @@ private:
std::atomic<int> is_stopped{0};
};
/// Schema reader for Parquet: reads the Arrow schema embedded in the file
/// and converts it to a ClickHouse header.
class ParquetSchemaReader : public ISchemaReader
{
public:
    ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

    NamesAndTypesList readSchema() override;

private:
    const FormatSettings format_settings;
};
}
#endif

View File

@ -73,6 +73,34 @@ void registerInputFormatProtobuf(FormatFactory & factory)
}
}
/// Builds the FormatSchemaInfo describing where the .proto schema file lives.
/// "Protobuf" is the format name; the boolean flag is passed as-is —
/// presumably "schema is required"; TODO confirm against FormatSchemaInfo's
/// constructor.
ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_settings)
    : schema_info(
        format_settings.schema.format_schema,
        "Protobuf",
        true,
        format_settings.schema.is_server,
        format_settings.schema.format_schema_path)
{
}
/// Resolves the protobuf message descriptor from the format schema and
/// converts it to a ClickHouse column list.
NamesAndTypesList ProtobufSchemaReader::readSchema()
{
    const auto * descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info);
    return protobufSchemaToCHSchema(descriptor);
}
/// Registers the external schema reader for both Protobuf formats.
/// Protobuf and ProtobufSingle share the same schema source (the .proto
/// file), so they use the same reader.
void registerProtobufSchemaReader(FormatFactory & factory)
{
    for (const auto * format_name : {"Protobuf", "ProtobufSingle"})
    {
        factory.registerExternalSchemaReader(format_name, [](const FormatSettings & settings)
        {
            return std::make_shared<ProtobufSchemaReader>(settings);
        });
    }
}
}
#else
@ -81,6 +109,8 @@ namespace DB
{
class FormatFactory;
/// No-op stubs used when ClickHouse is built without protobuf support
/// (USE_PROTOBUF is off): registration becomes a no-op.
void registerInputFormatProtobuf(FormatFactory &) {}
void registerProtobufSchemaReader(FormatFactory &) {}
}
#endif

View File

@ -3,7 +3,9 @@
#include "config_formats.h"
#if USE_PROTOBUF
# include <Formats/FormatSchemaInfo.h>
# include <Processors/Formats/IRowInputFormat.h>
# include <Processors/Formats/ISchemaReader.h>
namespace DB
{
@ -42,5 +44,16 @@ private:
std::unique_ptr<ProtobufSerializer> serializer;
};
/// External schema reader for Protobuf/ProtobufSingle: the table schema is
/// derived from the .proto format schema file, not from the data itself.
class ProtobufSchemaReader : public IExternalSchemaReader
{
public:
    explicit ProtobufSchemaReader(const FormatSettings & format_settings);

    NamesAndTypesList readSchema() override;

private:
    /// Location and message name of the .proto schema.
    FormatSchemaInfo schema_info;
};
}
#endif

View File

@ -51,5 +51,14 @@ void registerInputFormatRawBLOB(FormatFactory & factory)
});
}
/// Registers the external schema reader for RawBLOB.
/// The schema is fixed (a single String column), so the settings argument
/// is ignored.
void registerRawBLOBSchemaReader(FormatFactory & factory)
{
    factory.registerExternalSchemaReader("RawBLOB", [](const FormatSettings &)
    {
        return std::make_shared<RawBLOBSchemaReader>();
    });
}
}

View File

@ -1,6 +1,8 @@
#pragma once
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <DataTypes/DataTypeString.h>
namespace DB
@ -22,5 +24,14 @@ private:
bool readRow(MutableColumns & columns, RowReadExtension &) override;
};
/// External schema reader for the RawBLOB format: the schema is always a
/// single String column named "raw_blob", so no data needs to be read.
class RawBLOBSchemaReader: public IExternalSchemaReader
{
public:
    NamesAndTypesList readSchema() override
    {
        return {{"raw_blob", std::make_shared<DataTypeString>()}};
    }
};
}

View File

@ -14,18 +14,7 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
RegexpRowInputFormat::RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
: RegexpRowInputFormat(std::make_unique<PeekableReadBuffer>(in_), header_, params_, format_settings_)
{
}
RegexpRowInputFormat::RegexpRowInputFormat(
std::unique_ptr<PeekableReadBuffer> buf_, const Block & header_, Params params_, const FormatSettings & format_settings_)
: IRowInputFormat(header_, *buf_, std::move(params_))
, buf(std::move(buf_))
, format_settings(format_settings_)
, escaping_rule(format_settings_.regexp.escaping_rule)
, regexp(format_settings_.regexp.regexp)
RegexpFieldExtractor::RegexpFieldExtractor(const FormatSettings & format_settings) : regexp(format_settings.regexp.regexp), skip_unmatched(format_settings.regexp.skip_unmatched)
{
size_t fields_count = regexp.NumberOfCapturingGroups();
matched_fields.resize(fields_count);
@ -40,6 +29,50 @@ RegexpRowInputFormat::RegexpRowInputFormat(
}
}
/// Reads one line from the buffer and matches it against the regexp,
/// populating matched_fields on success.
/// Returns true when the line matched; returns false only when it did not
/// match and skip_unmatched is set (otherwise throws INCORRECT_DATA).
/// The line (and its terminator) is always consumed, matched or not.
bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf)
{
    PeekableReadBufferCheckpoint checkpoint{buf};

    /// Scan forward to the end of line, refilling the buffer as needed;
    /// the checkpoint keeps all scanned bytes available.
    size_t line_size = 0;
    do
    {
        char * pos = find_first_symbols<'\n', '\r'>(buf.position(), buf.buffer().end());
        line_size += pos - buf.position();
        buf.position() = pos;
    } while (buf.position() == buf.buffer().end() && !buf.eof());

    /// Make the whole line contiguous and rewind so the regexp sees it as
    /// one memory span starting at the current position.
    buf.makeContinuousMemoryFromCheckpointToPos();
    buf.rollbackToCheckpoint();

    bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size());

    if (!match && !skip_unmatched)
        throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA);

    /// Consume the line and its '\n' or '\r\n' terminator.
    buf.position() += line_size;
    checkChar('\r', buf);
    if (!buf.eof() && !checkChar('\n', buf))
        throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA);

    return match;
}
/// Public constructor: wraps the plain read buffer into a PeekableReadBuffer
/// (needed by the field extractor) and delegates to the private constructor.
RegexpRowInputFormat::RegexpRowInputFormat(
    ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
    : RegexpRowInputFormat(std::make_unique<PeekableReadBuffer>(in_), header_, params_, format_settings_)
{
}
/// Private constructor: takes ownership of the peekable buffer and builds
/// the field extractor from the regexp settings.
RegexpRowInputFormat::RegexpRowInputFormat(
    std::unique_ptr<PeekableReadBuffer> buf_, const Block & header_, Params params_, const FormatSettings & format_settings_)
    : IRowInputFormat(header_, *buf_, std::move(params_))
    , buf(std::move(buf_))
    , format_settings(format_settings_)
    , escaping_rule(format_settings_.regexp.escaping_rule)
    , field_extractor(RegexpFieldExtractor(format_settings_))
{
}
void RegexpRowInputFormat::resetParser()
{
@ -50,7 +83,8 @@ void RegexpRowInputFormat::resetParser()
bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
{
const auto & type = getPort().getHeader().getByPosition(index).type;
ReadBuffer field_buf(const_cast<char *>(matched_fields[index].data()), matched_fields[index].size(), 0);
auto matched_field = field_extractor.getField(index);
ReadBuffer field_buf(const_cast<char *>(matched_field.data()), matched_field.size(), 0);
try
{
return deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings);
@ -64,7 +98,7 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext)
{
if (matched_fields.size() != columns.size())
if (field_extractor.getMatchedFieldsSize() != columns.size())
throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA);
ext.read_columns.assign(columns.size(), false);
@ -79,39 +113,8 @@ bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &
if (buf->eof())
return false;
PeekableReadBufferCheckpoint checkpoint{*buf};
size_t line_size = 0;
do
{
char * pos = find_first_symbols<'\n', '\r'>(buf->position(), buf->buffer().end());
line_size += pos - buf->position();
buf->position() = pos;
} while (buf->position() == buf->buffer().end() && !buf->eof());
buf->makeContinuousMemoryFromCheckpointToPos();
buf->rollbackToCheckpoint();
bool match = RE2::FullMatchN(re2::StringPiece(buf->position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size());
bool read_line = true;
if (!match)
{
if (!format_settings.regexp.skip_unmatched)
throw Exception("Line \"" + std::string(buf->position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA);
read_line = false;
}
if (read_line)
if (field_extractor.parseRow(*buf))
readFieldsFromMatch(columns, ext);
buf->position() += line_size;
checkChar('\r', *buf);
if (!buf->eof() && !checkChar('\n', *buf))
throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA);
return true;
}
@ -121,6 +124,36 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_)
IInputFormat::setReadBuffer(*buf);
}
/// context_ is needed for type inference of some escaping rules.
/// Note: the base class is initialized with a reference to the member `buf`
/// before `buf` itself is constructed — valid as long as the base only
/// stores the reference. field_extractor is initialized from the already
/// initialized member format_settings (declared before it in the class).
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
    : IRowSchemaReader(
        buf,
        format_settings_.max_rows_to_read_for_schema_inference,
        getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule))
    , format_settings(format_settings_)
    , field_extractor(format_settings)
    , buf(in_)
    , context(context_)
{
}
/// Parses one line with the regexp and infers a type for every captured
/// group according to the configured escaping rule.
/// Returns an empty list at end of input.
DataTypes RegexpSchemaReader::readRowAndGetDataTypes()
{
    if (buf.eof())
        return {};

    field_extractor.parseRow(buf);

    const size_t num_fields = field_extractor.getMatchedFieldsSize();
    DataTypes types;
    types.reserve(num_fields);
    for (size_t i = 0; i < num_fields; ++i)
    {
        String field(field_extractor.getField(i));
        types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context));
    }
    return types;
}
void registerInputFormatRegexp(FormatFactory & factory)
{
factory.registerInputFormat("Regexp", [](
@ -172,4 +205,12 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory)
factory.registerFileSegmentationEngine("Regexp", &fileSegmentationEngineRegexpImpl);
}
/// Registers the Regexp schema reader; the context is forwarded because
/// type inference for some escaping rules needs it.
void registerRegexpSchemaReader(FormatFactory & factory)
{
    auto creator = [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
    {
        return std::make_shared<RegexpSchemaReader>(buf, settings, context);
    };
    factory.registerSchemaReader("Regexp", creator);
}
}

View File

@ -6,6 +6,7 @@
#include <vector>
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
#include <IO/PeekableReadBuffer.h>
@ -16,6 +17,29 @@ namespace DB
class ReadBuffer;
/// Class for extracting row fields from data by regexp.
/// Extracts row fields from a line of input using the configured regexp.
/// Shared between the Regexp input format and its schema reader.
/// NOTE(review): the single-argument constructor could be `explicit`;
/// both known call sites use direct initialization.
class RegexpFieldExtractor
{
public:
    RegexpFieldExtractor(const FormatSettings & format_settings);

    /// Return true if row was successfully parsed and row fields were extracted.
    bool parseRow(PeekableReadBuffer & buf);

    /// View into the last parsed line; valid only until the next parseRow call.
    re2::StringPiece getField(size_t index) { return matched_fields[index]; }
    size_t getMatchedFieldsSize() const { return matched_fields.size(); }
    size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); }

private:
    const RE2 regexp;
    // The vector of fields extracted from line using regexp.
    std::vector<re2::StringPiece> matched_fields;
    // These two vectors are needed to use RE2::FullMatchN (function for extracting fields).
    std::vector<RE2::Arg> re2_arguments;
    std::vector<RE2::Arg *> re2_arguments_ptrs;
    /// If set, non-matching lines are skipped instead of throwing.
    bool skip_unmatched;
};
/// Regexp input format.
/// This format applies regular expression from format_regexp setting for every line of file
/// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")).
@ -25,7 +49,6 @@ class ReadBuffer;
class RegexpRowInputFormat : public IRowInputFormat
{
using EscapingRule = FormatSettings::EscapingRule;
public:
RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_);
@ -36,6 +59,8 @@ public:
private:
RegexpRowInputFormat(std::unique_ptr<PeekableReadBuffer> buf_, const Block & header_, Params params_, const FormatSettings & format_settings_);
using EscapingRule = FormatSettings::EscapingRule;
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
bool readField(size_t index, MutableColumns & columns);
@ -44,13 +69,22 @@ private:
std::unique_ptr<PeekableReadBuffer> buf;
const FormatSettings format_settings;
const EscapingRule escaping_rule;
RegexpFieldExtractor field_extractor;
};
const RE2 regexp;
// The vector of fields extracted from line using regexp.
std::vector<re2::StringPiece> matched_fields;
// These two vectors are needed to use RE2::FullMatchN (function for extracting fields).
std::vector<RE2::Arg> re2_arguments;
std::vector<RE2::Arg *> re2_arguments_ptrs;
/// Schema reader for the Regexp format: each capture group of the regexp
/// becomes a column whose type is inferred via the configured escaping rule.
class RegexpSchemaReader : public IRowSchemaReader
{
public:
    RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);

private:
    DataTypes readRowAndGetDataTypes() override;

    using EscapingRule = FormatSettings::EscapingRule;
    const FormatSettings format_settings;
    RegexpFieldExtractor field_extractor;
    PeekableReadBuffer buf;
    /// Needed by determineDataTypeByEscapingRule for some escaping rules.
    ContextPtr context;
};
}

View File

@ -1,7 +1,10 @@
#include <IO/ReadHelpers.h>
#include <Processors/Formats/Impl/TSKVRowInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/EscapingRuleUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
namespace DB
@ -211,6 +214,59 @@ void TSKVRowInputFormat::resetParser()
name_buf.clear();
}
/// TSKV values are escaped, so the default inferred type corresponds to
/// the Escaped escaping rule.
TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
    : IRowWithNamesSchemaReader(
        in_,
        format_settings_.max_rows_to_read_for_schema_inference,
        getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped))
    , format_settings(format_settings_)
{
}
/// Reads one TSKV row ("name=value" pairs separated by tabs, terminated by
/// '\n') and returns inferred {name -> type} pairs.
/// Returns an empty map at end of input or for an empty line.
std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndDataTypes()
{
    if (first_row)
    {
        /// A BOM may precede the very first row only.
        skipBOMIfExists(in);
        first_row = false;
    }

    if (in.eof())
        return {};

    /// An empty line yields no columns.
    if (*in.position() == '\n')
    {
        ++in.position();
        return {};
    }

    std::unordered_map<String, DataTypePtr> names_and_types;
    StringRef name_ref;
    String name_tmp;
    String value;
    do
    {
        /// readName returns false when the field has no '=value' part.
        bool has_value = readName(in, name_ref, name_tmp);
        if (has_value)
        {
            readEscapedString(value, in);
            names_and_types[String(name_ref)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped);
        }
        else
        {
            /// The only thing that can go without value is `tskv` fragment that is ignored.
            if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4)))
                throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
        }
    }
    while (checkChar('\t', in));

    assertChar('\n', in);
    return names_and_types;
}
void registerInputFormatTSKV(FormatFactory & factory)
{
factory.registerInputFormat("TSKV", [](
@ -222,5 +278,12 @@ void registerInputFormatTSKV(FormatFactory & factory)
return std::make_shared<TSKVRowInputFormat>(buf, sample, std::move(params), settings);
});
}
/// Registers the TSKV schema reader in the format factory.
void registerTSKVSchemaReader(FormatFactory & factory)
{
    auto creator = [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
    {
        return std::make_shared<TSKVSchemaReader>(buf, settings);
    };
    factory.registerSchemaReader("TSKV", creator);
}
}

View File

@ -2,6 +2,7 @@
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Common/HashTable/HashMap.h>
@ -52,4 +53,16 @@ private:
/// for row like ..., non-nullable column name=\N, ...
};
/// Schema reader for TSKV: column names come from the "name=" prefixes and
/// types are inferred from the escaped values.
class TSKVSchemaReader : public IRowWithNamesSchemaReader
{
public:
    TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

private:
    /// Reads one row, returns inferred {name -> type}; empty map on EOF.
    std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;

    const FormatSettings format_settings;
    /// Used to skip a BOM on the first row only.
    bool first_row = true;
};
}

View File

@ -1,13 +1,15 @@
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
#include <IO/BufferWithOwnMemory.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Formats/verbosePrintString.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <Formats/verbosePrintString.h>
#include <Formats/EscapingRuleUtils.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
namespace DB
{
@ -38,40 +40,50 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(
bool with_types_,
bool is_raw_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique<TabSeparatedFormatReader>(in_, format_settings_, is_raw_))
{
}
void TabSeparatedRowInputFormat::skipFieldDelimiter()
/// is_raw_: TSVRaw variant — fields are read without unescaping.
TabSeparatedFormatReader::TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool is_raw_)
    : FormatWithNamesAndTypesReader(in_, format_settings_), is_raw(is_raw_)
{
}
/// Consumes the mandatory tab between fields; throws if it is absent.
void TabSeparatedFormatReader::skipFieldDelimiter()
{
    assertChar('\t', *in);
}
void TabSeparatedRowInputFormat::skipRowEndDelimiter()
void TabSeparatedFormatReader::skipRowEndDelimiter()
{
if (in->eof())
return;
if (unlikely(row_num <= 1))
if (unlikely(first_row))
{
checkForCarriageReturn(*in);
first_row = false;
}
assertChar('\n', *in);
}
/// Reads one field into a String: raw (no unescaping) for TSVRaw,
/// escaped otherwise.
/// (The rendered diff left the superseded declaration and an unconditional
/// readEscapedString call stacked with the new is_raw branch; only the
/// final TabSeparatedFormatReader version is kept.)
String TabSeparatedFormatReader::readFieldIntoString()
{
    String field;
    if (is_raw)
        readString(field, *in);
    else
        readEscapedString(field, *in);
    return field;
}
void TabSeparatedRowInputFormat::skipField()
void TabSeparatedFormatReader::skipField()
{
NullOutput null_sink;
readEscapedStringInto(null_sink, *in);
readFieldIntoString();
}
void TabSeparatedRowInputFormat::skipHeaderRow()
void TabSeparatedFormatReader::skipHeaderRow()
{
do
{
@ -82,7 +94,7 @@ void TabSeparatedRowInputFormat::skipHeaderRow()
skipRowEndDelimiter();
}
std::vector<String> TabSeparatedRowInputFormat::readHeaderRow()
std::vector<String> TabSeparatedFormatReader::readRow()
{
std::vector<String> fields;
do
@ -95,7 +107,7 @@ std::vector<String> TabSeparatedRowInputFormat::readHeaderRow()
return fields;
}
bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type,
bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type,
const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/)
{
const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t';
@ -118,6 +130,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr &
return true;
}
if (as_nullable)
return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization);
@ -125,7 +138,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr &
return true;
}
bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
bool TabSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
try
{
@ -156,7 +169,7 @@ bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuff
return true;
}
bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
if (in->eof())
return true;
@ -190,7 +203,7 @@ bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out
return true;
}
void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type)
void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type)
{
bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default;
@ -218,6 +231,28 @@ void TabSeparatedRowInputFormat::syncAfterError()
skipToUnescapedNextLineOrEOF(*in);
}
TabSeparatedSchemaReader::TabSeparatedSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_settings_.max_rows_to_read_for_schema_inference,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(is_raw_ ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped))
, reader(in_, format_settings_, is_raw_)
{
}
DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes()
{
if (in.eof())
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
}
void registerInputFormatTabSeparated(FormatFactory & factory)
{
for (bool is_raw : {false, true})
@ -239,6 +274,23 @@ void registerInputFormatTabSeparated(FormatFactory & factory)
}
}
void registerTSVSchemaReader(FormatFactory & factory)
{
for (bool is_raw : {false, true})
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
{
return std::make_shared<TabSeparatedSchemaReader>(buf, with_names, with_types, is_raw, settings);
});
};
registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func);
}
}
static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows)
{
bool need_more_data = true;

View File

@ -3,6 +3,7 @@
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
namespace DB
@ -24,6 +25,13 @@ public:
private:
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
};
class TabSeparatedFormatReader : public FormatWithNamesAndTypesReader
{
public:
TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings, bool is_raw_);
bool readField(IColumn & column, const DataTypePtr & type,
const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
@ -36,18 +44,34 @@ private:
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
std::vector<String> readRow();
std::vector<String> readNames() override { return readRow(); }
std::vector<String> readTypes() override { return readRow(); }
String readFieldIntoString();
void checkNullValueForNonNullable(DataTypePtr type) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
FormatSettings::EscapingRule getEscapingRule()
{
return is_raw ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped;
}
private:
bool is_raw;
bool first_row = true;
};
class TabSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings);
private:
DataTypes readRowAndGetDataTypes() override;
TabSeparatedFormatReader reader;
};
}

View File

@ -22,7 +22,10 @@ void TabSeparatedRowOutputFormat::writeLine(const std::vector<String> & values)
{
for (size_t i = 0; i < values.size(); ++i)
{
writeEscapedString(values[i], out);
if (is_raw)
writeString(values[i], out);
else
writeEscapedString(values[i], out);
if (i + 1 == values.size())
writeRowEndDelimiter();
else

View File

@ -4,7 +4,6 @@
#include <Formats/EscapingRuleUtils.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeNothing.h>
#include <Interpreters/Context.h>
#include <DataTypes/Serializations/SerializationNullable.h>
namespace DB
@ -12,13 +11,19 @@ namespace DB
namespace ErrorCodes
{
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int CANNOT_READ_ALL_DATA;
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
extern const int CANNOT_PARSE_QUOTED_STRING;
extern const int SYNTAX_ERROR;
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int CANNOT_READ_ALL_DATA;
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
extern const int CANNOT_PARSE_QUOTED_STRING;
extern const int SYNTAX_ERROR;
}
[[noreturn]] static void throwUnexpectedEof(size_t row_num)
{
throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
ErrorCodes::CANNOT_READ_ALL_DATA);
}
TemplateRowInputFormat::TemplateRowInputFormat(
const Block & header_,
@ -41,37 +46,13 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu
: RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()),
settings(std::move(settings_)), ignore_spaces(ignore_spaces_),
format(std::move(format_)), row_format(std::move(row_format_)),
default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(std::move(row_between_delimiter_))
default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_),
format_reader(std::make_unique<TemplateFormatReader>(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings))
{
/// Validate format string for result set
bool has_data = false;
for (size_t i = 0; i < format.columnsCount(); ++i)
{
if (format.format_idx_to_column_idx[i])
{
if (*format.format_idx_to_column_idx[i] != 0)
format.throwInvalidFormat("Invalid input part", i);
if (has_data)
format.throwInvalidFormat("${data} can occur only once", i);
if (format.escaping_rules[i] != EscapingRule::None)
format.throwInvalidFormat("${data} must have empty or None deserialization type", i);
has_data = true;
format_data_idx = i;
}
else
{
if (format.escaping_rules[i] == EscapingRule::XML)
format.throwInvalidFormat("XML deserialization is not supported", i);
}
}
/// Validate format string for rows
std::vector<UInt8> column_in_format(header_.columns(), false);
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
if (row_format.escaping_rules[i] == EscapingRule::XML)
row_format.throwInvalidFormat("XML deserialization is not supported", i);
if (row_format.format_idx_to_column_idx[i])
{
if (header_.columns() <= *row_format.format_idx_to_column_idx[i])
@ -94,69 +75,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu
void TemplateRowInputFormat::readPrefix()
{
size_t last_successfully_parsed_idx = 0;
try
{
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format_data_idx);
}
catch (Exception & e)
{
format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx);
}
}
/// Asserts delimiters and skips fields in prefix or suffix.
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
/// (most likely false will be returned on first call of checkString(...))
template <typename ReturnType>
ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
skipSpaces();
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], *buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
return ReturnType(false);
}
while (input_part_beg < input_part_end)
{
skipSpaces();
if constexpr (throw_exception)
skipField(format.escaping_rules[input_part_beg]);
else
{
try
{
skipField(format.escaping_rules[input_part_beg]);
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
/// If it's parsing error, then suffix is not found
return ReturnType(false);
}
}
++input_part_beg;
skipSpaces();
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], *buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
return ReturnType(false);
}
}
if constexpr (!throw_exception)
return ReturnType(true);
format_reader->readPrefix();
}
bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra)
@ -165,9 +84,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
if (unlikely(end_of_stream))
return false;
skipSpaces();
if (unlikely(checkForSuffix()))
if (unlikely(format_reader->checkForSuffix()))
{
end_of_stream = true;
return false;
@ -176,27 +93,24 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
updateDiagnosticInfo();
if (likely(row_num != 1))
assertString(row_between_delimiter, *buf);
format_reader->skipRowBetweenDelimiter();
extra.read_columns.assign(columns.size(), false);
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
skipSpaces();
assertString(row_format.delimiters[i], *buf);
skipSpaces();
format_reader->skipDelimiter(i);
if (row_format.format_idx_to_column_idx[i])
{
size_t col_idx = *row_format.format_idx_to_column_idx[i];
extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i);
}
else
skipField(row_format.escaping_rules[i]);
format_reader->skipField(row_format.escaping_rules[i]);
}
skipSpaces();
assertString(row_format.delimiters.back(), *buf);
format_reader->skipRowEndDelimiter();
for (const auto & idx : always_default_columns)
data_types[idx]->insertDefaultInto(*columns[idx]);
@ -219,65 +133,21 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
catch (Exception & e)
{
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throwUnexpectedEof();
throwUnexpectedEof(row_num);
throw;
}
}
void TemplateRowInputFormat::skipField(TemplateRowInputFormat::EscapingRule escaping_rule)
{
try
{
skipFieldByEscapingRule(*buf, escaping_rule, settings);
}
catch (Exception & e)
{
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throwUnexpectedEof();
throw;
}
}
/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
/// Otherwise returns false
bool TemplateRowInputFormat::checkForSuffix()
{
PeekableReadBufferCheckpoint checkpoint{*buf};
bool suffix_found = false;
size_t last_successfully_parsed_idx = format_data_idx + 1;
try
{
suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
}
if (unlikely(suffix_found))
{
skipSpaces();
if (buf->eof())
return true;
}
buf->rollbackToCheckpoint();
return false;
}
bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
out << "Suffix does not match: ";
size_t last_successfully_parsed_idx = format_data_idx + 1;
size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1;
const ReadBuffer::Position row_begin_pos = buf->position();
bool caught = false;
try
{
PeekableReadBufferCheckpoint checkpoint{*buf, true};
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
format_reader->tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format.columnsCount());
}
catch (Exception & e)
{
@ -309,7 +179,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col
if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces))
return false;
skipSpaces();
format_reader->skipSpaces();
if (row_format.format_idx_to_column_idx[i])
{
const auto & header = getPort().getHeader();
@ -364,7 +234,7 @@ void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColu
if (index)
deserializeField(type, serializations[*index], column, file_column);
else
skipField(row_format.escaping_rules[file_column]);
format_reader->skipField(row_format.escaping_rules[file_column]);
}
bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position)
@ -387,13 +257,6 @@ void TemplateRowInputFormat::syncAfterError()
/// It will cause another parsing error.
}
void TemplateRowInputFormat::throwUnexpectedEof()
{
throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". "
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
ErrorCodes::CANNOT_READ_ALL_DATA);
}
void TemplateRowInputFormat::resetParser()
{
RowInputFormatWithDiagnosticInfo::resetParser();
@ -407,6 +270,268 @@ void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_)
IInputFormat::setReadBuffer(*buf);
}
TemplateFormatReader::TemplateFormatReader(
PeekableReadBuffer & buf_,
bool ignore_spaces_,
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter_,
const FormatSettings & format_settings_)
: buf(&buf_)
, ignore_spaces(ignore_spaces_)
, format(format_)
, row_format(row_format_)
, row_between_delimiter(row_between_delimiter_)
, format_settings(format_settings_)
{
/// Validate format string for result set
bool has_data = false;
for (size_t i = 0; i < format.columnsCount(); ++i)
{
if (format.format_idx_to_column_idx[i])
{
if (*format.format_idx_to_column_idx[i] != 0)
format.throwInvalidFormat("Invalid input part", i);
if (has_data)
format.throwInvalidFormat("${data} can occur only once", i);
if (format.escaping_rules[i] != EscapingRule::None)
format.throwInvalidFormat("${data} must have empty or None deserialization type", i);
has_data = true;
format_data_idx = i;
}
else
{
if (format.escaping_rules[i] == EscapingRule::XML)
format.throwInvalidFormat("XML deserialization is not supported", i);
}
}
/// Validate format string for rows
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
if (row_format.escaping_rules[i] == EscapingRule::XML)
row_format.throwInvalidFormat("XML deserialization is not supported", i);
}
}
void TemplateFormatReader::readPrefix()
{
size_t last_successfully_parsed_idx = 0;
try
{
tryReadPrefixOrSuffix<void>(last_successfully_parsed_idx, format_data_idx);
}
catch (Exception & e)
{
format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx);
}
}
void TemplateFormatReader::skipField(EscapingRule escaping_rule)
{
try
{
skipFieldByEscapingRule(*buf, escaping_rule, format_settings);
}
catch (Exception & e)
{
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throwUnexpectedEof(row_num);
throw;
}
}
/// Asserts delimiters and skips fields in prefix or suffix.
/// tryReadPrefixOrSuffix<bool>(...) is used in checkForSuffix() to avoid throwing an exception after read of each row
/// (most likely false will be returned on first call of checkString(...))
template <typename ReturnType>
ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
skipSpaces();
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], *buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
return ReturnType(false);
}
while (input_part_beg < input_part_end)
{
skipSpaces();
if constexpr (throw_exception)
skipField(format.escaping_rules[input_part_beg]);
else
{
try
{
skipField(format.escaping_rules[input_part_beg]);
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
/// If it's parsing error, then suffix is not found
return ReturnType(false);
}
}
++input_part_beg;
skipSpaces();
if constexpr (throw_exception)
assertString(format.delimiters[input_part_beg], *buf);
else
{
if (likely(!checkString(format.delimiters[input_part_beg], *buf)))
return ReturnType(false);
}
}
if constexpr (!throw_exception)
return ReturnType(true);
}
/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF.
/// Otherwise returns false
bool TemplateFormatReader::checkForSuffix()
{
PeekableReadBufferCheckpoint checkpoint{*buf};
bool suffix_found = false;
size_t last_successfully_parsed_idx = format_data_idx + 1;
try
{
suffix_found = tryReadPrefixOrSuffix<bool>(last_successfully_parsed_idx, format.columnsCount());
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF &&
e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE &&
e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING)
throw;
}
if (unlikely(suffix_found))
{
skipSpaces();
if (buf->eof())
return true;
}
buf->rollbackToCheckpoint();
return false;
}
void TemplateFormatReader::skipDelimiter(size_t index)
{
skipSpaces();
assertString(row_format.delimiters[index], *buf);
skipSpaces();
}
void TemplateFormatReader::skipRowEndDelimiter()
{
++row_num;
skipSpaces();
assertString(row_format.delimiters.back(), *buf);
skipSpaces();
}
void TemplateFormatReader::skipRowBetweenDelimiter()
{
skipSpaces();
assertString(row_between_delimiter, *buf);
skipSpaces();
}
TemplateSchemaReader::TemplateSchemaReader(
ReadBuffer & in_,
bool ignore_spaces_,
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter,
const FormatSettings & format_settings_,
ContextPtr context_)
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference)
, buf(in_)
, format(format_)
, row_format(row_format_)
, format_settings(format_settings_)
, context(context_)
, format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
{
setColumnNames(row_format.column_names);
}
DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
{
if (first_row)
format_reader.readPrefix();
if (format_reader.checkForSuffix())
return {};
if (first_row)
first_row = false;
else
format_reader.skipRowBetweenDelimiter();
DataTypes data_types;
data_types.reserve(row_format.columnsCount());
String field;
for (size_t i = 0; i != row_format.columnsCount(); ++i)
{
format_reader.skipDelimiter(i);
if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV)
format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context));
}
format_reader.skipRowEndDelimiter();
return data_types;
}
static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)
{
ParsedTemplateFormatString resultset_format;
if (settings.template_settings.resultset_format.empty())
{
/// Default format string: "${data}"
resultset_format.delimiters.resize(2);
resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
resultset_format.format_idx_to_column_idx.emplace_back(0);
resultset_format.column_names.emplace_back("data");
}
else
{
/// Read format string from file
resultset_format = ParsedTemplateFormatString(
FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false,
settings.schema.is_server, settings.schema.format_schema_path),
[&](const String & partName) -> std::optional<size_t>
{
if (partName == "data")
return 0;
throw Exception("Unknown input part " + partName,
ErrorCodes::SYNTAX_ERROR);
});
}
return resultset_format;
}
static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes)
{
return ParsedTemplateFormatString(
FormatSchemaInfo(
settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path),
idx_getter, allow_indexes);
}
void registerInputFormatTemplate(FormatFactory & factory)
{
for (bool ignore_spaces : {false, true})
@ -417,39 +542,34 @@ void registerInputFormatTemplate(FormatFactory & factory)
IRowInputFormat::Params params,
const FormatSettings & settings)
{
ParsedTemplateFormatString resultset_format;
if (settings.template_settings.resultset_format.empty())
auto idx_getter = [&](const String & colName) -> std::optional<size_t>
{
/// Default format string: "${data}"
resultset_format.delimiters.resize(2);
resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None);
resultset_format.format_idx_to_column_idx.emplace_back(0);
resultset_format.column_names.emplace_back("data");
}
else
{
/// Read format string from file
resultset_format = ParsedTemplateFormatString(
FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false,
settings.schema.is_server, settings.schema.format_schema_path),
[&](const String & partName) -> std::optional<size_t>
{
if (partName == "data")
return 0;
throw Exception("Unknown input part " + partName,
ErrorCodes::SYNTAX_ERROR);
});
}
return sample.getPositionByName(colName);
};
ParsedTemplateFormatString row_format = ParsedTemplateFormatString(
FormatSchemaInfo(settings.template_settings.row_format, "Template", false,
settings.schema.is_server, settings.schema.format_schema_path),
[&](const String & colName) -> std::optional<size_t>
{
return sample.getPositionByName(colName);
});
return std::make_shared<TemplateRowInputFormat>(
sample,
buf,
params,
settings,
ignore_spaces,
fillResultSetFormat(settings),
fillRowFormat(settings, idx_getter, true),
settings.template_settings.row_between_delimiter);
});
}
}
return std::make_shared<TemplateRowInputFormat>(sample, buf, params, settings, ignore_spaces, resultset_format, row_format, settings.template_settings.row_between_delimiter);
void registerTemplateSchemaReader(FormatFactory & factory)
{
for (bool ignore_spaces : {false, true})
{
factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
{
size_t index = 0;
auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
auto row_format = fillRowFormat(settings, idx_getter, false);
return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context);
});
}
}

View File

@ -2,15 +2,19 @@
#include <Core/Block.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <IO/ReadHelpers.h>
#include <IO/PeekableReadBuffer.h>
#include <Interpreters/Context.h>
namespace DB
{
class TemplateFormatReader;
class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo
{
using EscapingRule = FormatSettings::EscapingRule;
@ -40,14 +44,6 @@ private:
bool deserializeField(const DataTypePtr & type,
const SerializationPtr & serialization, IColumn & column, size_t file_column);
void skipField(EscapingRule escaping_rule);
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
template <typename ReturnType = void>
ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
bool checkForSuffix();
[[noreturn]] void throwUnexpectedEof();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
@ -63,12 +59,76 @@ private:
const ParsedTemplateFormatString format;
const ParsedTemplateFormatString row_format;
size_t format_data_idx;
bool end_of_stream = false;
std::vector<size_t> always_default_columns;
const char default_csv_delimiter;
const std::string row_between_delimiter;
std::unique_ptr<TemplateFormatReader> format_reader;
};
class TemplateFormatReader
{
using EscapingRule = FormatSettings::EscapingRule;
public:
TemplateFormatReader(
PeekableReadBuffer & buf_,
bool ignore_spaces_,
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter,
const FormatSettings & format_settings_);
void readPrefix();
void skipField(EscapingRule escaping_rule);
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
template <typename ReturnType = void>
ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end);
bool checkForSuffix();
void setReadBuffer(PeekableReadBuffer & buf_) { buf = &buf_; }
void skipDelimiter(size_t index);
void skipRowEndDelimiter();
void skipRowBetweenDelimiter();
size_t getFormatDataIdx() const { return format_data_idx; }
private:
PeekableReadBuffer * buf;
bool ignore_spaces;
const ParsedTemplateFormatString & format;
const ParsedTemplateFormatString & row_format;
const std::string row_between_delimiter;
const FormatSettings & format_settings;
size_t format_data_idx;
size_t row_num;
};
class TemplateSchemaReader : public IRowSchemaReader
{
public:
TemplateSchemaReader(ReadBuffer & in_,
bool ignore_spaces_,
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter,
const FormatSettings & format_settings_,
ContextPtr context_);
DataTypes readRowAndGetDataTypes() override;
private:
PeekableReadBuffer buf;
const ParsedTemplateFormatString format;
const ParsedTemplateFormatString row_format;
FormatSettings format_settings;
ContextPtr context;
TemplateFormatReader format_reader;
bool first_row = true;
};
bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces);

View File

@ -5,6 +5,7 @@
#include <Parsers/TokenIterator.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Core/Block.h>
#include <base/find_symbols.h>
#include <Common/typeid_cast.h>
@ -15,6 +16,7 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
#include <base/logger_useful.h>
namespace DB
{
@ -286,6 +288,50 @@ namespace
}
}
/// Can be used in fileSegmentationEngine for parallel parsing of Values
static bool skipToNextRow(PeekableReadBuffer * buf, size_t min_chunk_bytes, int balance)
{
skipWhitespaceIfAny(*buf);
if (buf->eof() || *buf->position() == ';')
return false;
bool quoted = false;
size_t chunk_begin_buf_count = buf->count();
while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes))
{
buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end());
if (buf->position() == buf->buffer().end())
continue;
if (*buf->position() == '\\')
{
++buf->position();
if (!buf->eof())
++buf->position();
}
else if (*buf->position() == '\'')
{
quoted ^= true;
++buf->position();
}
else if (*buf->position() == ')')
{
++buf->position();
if (!quoted)
--balance;
}
else if (*buf->position() == '(')
{
++buf->position();
if (!quoted)
++balance;
}
}
if (!buf->eof() && *buf->position() == ',')
++buf->position();
return true;
}
bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
{
const Block & header = getPort().getHeader();
@ -293,7 +339,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
auto settings = context->getSettingsRef();
/// We need continuous memory containing the expression to use Lexer
skipToNextRow(0, 1);
skipToNextRow(buf.get(), 0, 1);
buf->makeContinuousMemoryFromCheckpointToPos();
buf->rollbackToCheckpoint();
@ -437,50 +483,6 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
return true;
}
/// Can be used in fileSegmentationEngine for parallel parsing of Values
bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance)
{
skipWhitespaceIfAny(*buf);
if (buf->eof() || *buf->position() == ';')
return false;
bool quoted = false;
size_t chunk_begin_buf_count = buf->count();
while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes))
{
buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end());
if (buf->position() == buf->buffer().end())
continue;
if (*buf->position() == '\\')
{
++buf->position();
if (!buf->eof())
++buf->position();
}
else if (*buf->position() == '\'')
{
quoted ^= true;
++buf->position();
}
else if (*buf->position() == ')')
{
++buf->position();
if (!quoted)
--balance;
}
else if (*buf->position() == '(')
{
++buf->position();
if (!quoted)
++balance;
}
}
if (!buf->eof() && *buf->position() == ',')
++buf->position();
return true;
}
void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx)
{
if (unlikely(!checkDelimiterAfterValue(column_idx)))
@ -559,6 +561,63 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_)
IInputFormat::setReadBuffer(*buf);
}
ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_)
{
}
DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
{
if (first_row)
{
skipBOMIfExists(buf);
first_row = false;
}
skipWhitespaceIfAny(buf);
if (buf.eof())
return {};
assertChar('(', buf);
PeekableReadBufferCheckpoint checkpoint(buf);
skipToNextRow(&buf, 0, 1);
buf.makeContinuousMemoryFromCheckpointToPos();
buf.rollbackToCheckpoint();
Tokens tokens(buf.position(), buf.buffer().end());
IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
DataTypes data_types;
bool finish = false;
while (!finish)
{
Expected expected;
ASTPtr ast;
bool parsed = parser.parse(token_iterator, ast, expected);
/// Consider delimiter after value (',' or ')') as part of expression
parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket;
if (!parsed)
throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}",
String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end));
std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
data_types.push_back(generalizeDataType(result.second));
if (token_iterator->type == TokenType::ClosingRoundBracket)
finish = true;
++token_iterator;
buf.position() = const_cast<char *>(token_iterator->begin);
}
skipWhitespaceIfAny(buf);
if (!buf.eof() && *buf.position() == ',')
++buf.position();
return data_types;
}
void registerInputFormatValues(FormatFactory & factory)
{
factory.registerInputFormat("Values", [](
@ -571,4 +630,12 @@ void registerInputFormatValues(FormatFactory & factory)
});
}
void registerValuesSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
{
return std::make_shared<ValuesSchemaReader>(buf, settings, context);
});
}
}

Some files were not shown because too many files have changed in this diff Show More