Merge pull request #1587 from rlipovsky/geodata_readers

[clickhouse-yt] separate geoexport format readers from data files
This commit is contained in:
alexey-milovidov 2017-12-05 00:18:54 +03:00 committed by GitHub
commit 20a8812eba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 132 additions and 111 deletions

View File

@ -1,82 +1,13 @@
#include <Dictionaries/Embedded/GeodataProviders/HierarchiesProvider.h>
#include <Dictionaries/Embedded/GeodataProviders/HierarchyFormatReader.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/Util/Application.h>
#include <Poco/Exception.h>
#include <Poco/File.h>
#include <Poco/DirectoryIterator.h>
/// Reads region hierarchy entries from a file identified by its path.
class RegionsHierarchyFileReader : public IRegionsHierarchyReader
{
private:
    /// Read buffer over the file; consumed row by row in readNext().
    DB::ReadBufferFromFile in;

public:
    /// Constructs the read buffer from `path`.
    /// `explicit` keeps a std::string from converting into a reader implicitly.
    explicit RegionsHierarchyFileReader(const std::string & path)
        : in(path)
    {}

    /// Stores the next accepted row in `entry` and returns true;
    /// returns false when the buffer reports end of input.
    bool readNext(RegionEntry & entry) override;
};
/** Parses rows of the form
  *     region_id \t parent_id \t type [\t population] \n
  * and stores the first accepted row in `entry`.
  * Returns true if an entry was produced, false once the input is exhausted.
  */
bool RegionsHierarchyFileReader::readNext(RegionEntry & entry)
{
    while (!in.eof())
    {
        /// Values are read as signed: the internal geobase uses negative
        /// numbers to mark a row as garbage that must be ignored.
        Int32 raw_id = 0;
        Int32 raw_parent = 0;
        Int8 raw_type = 0;

        DB::readIntText(raw_id, in);
        DB::assertChar('\t', in);
        DB::readIntText(raw_parent, in);
        DB::assertChar('\t', in);
        DB::readIntText(raw_type, in);

        /// Old format: the row ends here with a newline.
        /// New format: a tab and the region's population follow before the line feed.
        RegionPopulation population = 0;
        if (!in.eof() && *in.position() == '\t')
        {
            ++in.position();

            UInt64 wide_population = 0;
            DB::readIntText(wide_population, in);

            /// Saturate at the largest representable population.
            if (wide_population > std::numeric_limits<RegionPopulation>::max())
                population = std::numeric_limits<RegionPopulation>::max();
            else
                population = wide_population;
        }

        DB::assertChar('\n', in);

        const bool usable = raw_id > 0 && raw_type >= 0;
        if (!usable)
            continue;

        /// A negative parent is stored as 0.
        const RegionID parent_id = raw_parent >= 0 ? raw_parent : 0;
        const RegionID region_id = raw_id;

        entry.id = region_id;
        entry.parent_id = parent_id;
        entry.type = static_cast<RegionType>(raw_type);
        entry.population = population;
        return true;
    }

    return false;
}
bool RegionsHierarchyDataSource::isModified() const
{
return updates_tracker.isModified();
@ -85,7 +16,8 @@ bool RegionsHierarchyDataSource::isModified() const
/// Creates a reader that parses the hierarchy file at `path`.
/// The file is opened through a ReadBufferFromFile, which is handed to the format reader.
IRegionsHierarchyReaderPtr RegionsHierarchyDataSource::createReader()
{
    /// NOTE(review): presumably pins the file version being read so that
    /// isModified() can report later changes; confirm against updates_tracker.
    updates_tracker.fixCurrentVersion();

    /// The previous early `return std::make_unique<RegionsHierarchyFileReader>(path);`
    /// made the two statements below unreachable; it has been removed.
    auto file_reader = std::make_shared<DB::ReadBufferFromFile>(path);
    return std::make_unique<RegionsHierarchyFormatReader>(std::move(file_reader));
}

View File

@ -0,0 +1,58 @@
#include <Dictionaries/Embedded/GeodataProviders/HierarchyFormatReader.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
/** Reads the next accepted hierarchy row from `input` into `entry`.
  * Row layout: region_id \t parent_id \t type [\t population] \n
  * Returns false when the input has no more data.
  */
bool RegionsHierarchyFormatReader::readNext(RegionEntry & entry)
{
    auto & buf = *input;

    while (!buf.eof())
    {
        /// The internal geobase may contain negative numbers,
        /// which mean "this row is garbage, ignore it".
        Int32 id_field = 0;
        Int32 parent_field = 0;
        Int8 type_field = 0;

        DB::readIntText(id_field, buf);
        DB::assertChar('\t', buf);
        DB::readIntText(parent_field, buf);
        DB::assertChar('\t', buf);
        DB::readIntText(type_field, buf);

        /// What follows is either a newline (old version) or
        /// a tab, the region's population and a line feed (new version).
        RegionPopulation population = 0;
        if (!buf.eof() && *buf.position() == '\t')
        {
            ++buf.position();

            UInt64 parsed_population = 0;
            DB::readIntText(parsed_population, buf);

            /// Values above the representable maximum are clamped to it.
            if (parsed_population > std::numeric_limits<RegionPopulation>::max())
                population = std::numeric_limits<RegionPopulation>::max();
            else
                population = parsed_population;
        }

        DB::assertChar('\n', buf);

        if (id_field > 0 && type_field >= 0)
        {
            const RegionID region_id = id_field;

            /// Negative parent ids are replaced by 0.
            RegionID parent_id = 0;
            if (parent_field >= 0)
                parent_id = parent_field;

            entry.id = region_id;
            entry.parent_id = parent_id;
            entry.type = static_cast<RegionType>(type_field);
            entry.population = population;
            return true;
        }
    }

    return false;
}

View File

@ -0,0 +1,21 @@
#pragma once
#include <Dictionaries/Embedded/GeodataProviders/IHierarchiesProvider.h>
#include <IO/ReadBuffer.h>
/// Reads regions hierarchy in geoexport format from an arbitrary read buffer.
class RegionsHierarchyFormatReader : public IRegionsHierarchyReader
{
private:
    /// Source of the rows; dereferenced on every readNext() call.
    DB::ReadBufferPtr input;

public:
    /// Takes the buffer to read from.
    /// `explicit` prevents a buffer pointer from converting into a reader implicitly.
    explicit RegionsHierarchyFormatReader(DB::ReadBufferPtr input_)
        : input(std::move(input_))
    {}

    /// Stores the next accepted row in `entry` and returns true;
    /// returns false when `input` reports end of data.
    bool readNext(RegionEntry & entry) override;
};

View File

@ -0,0 +1,27 @@
#include <Dictionaries/Embedded/GeodataProviders/NamesFormatReader.h>
#include <IO/ReadHelpers.h>
/** Reads the next accepted row of the form `region_id \t name \n` from `input` into `entry`.
  * Rows whose id is not positive are skipped.
  * Returns true if an entry was produced, false once the input is exhausted.
  */
bool LanguageRegionsNamesFormatReader::readNext(RegionNameEntry & entry)
{
    while (!input->eof())
    {
        /// Initialized like the locals in the hierarchy reader, so the value is never indeterminate.
        Int32 read_region_id = 0;
        std::string region_name;

        DB::readIntText(read_region_id, *input);
        DB::assertChar('\t', *input);
        DB::readString(region_name, *input);
        DB::assertChar('\n', *input);

        if (read_region_id <= 0)
            continue;

        entry.id = read_region_id;
        /// The local string is not used afterwards, so hand its storage over instead of copying.
        entry.name = std::move(region_name);
        return true;
    }

    return false;
}

View File

@ -0,0 +1,20 @@
#pragma once
#include <Dictionaries/Embedded/GeodataProviders/INamesProvider.h>
#include <IO/ReadBuffer.h>
/// Reads regions names list in geoexport format from an arbitrary read buffer.
class LanguageRegionsNamesFormatReader : public ILanguageRegionsNamesReader
{
private:
    /// Source of the rows; dereferenced on every readNext() call.
    DB::ReadBufferPtr input;

public:
    /// Takes the buffer to read from.
    /// `explicit` prevents a buffer pointer from converting into a reader implicitly.
    explicit LanguageRegionsNamesFormatReader(DB::ReadBufferPtr input_)
        : input(std::move(input_))
    {}

    /// Stores the next accepted row in `entry` and returns true;
    /// returns false when `input` reports end of data.
    bool readNext(RegionNameEntry & entry) override;
};

View File

@ -1,45 +1,7 @@
#include <Dictionaries/Embedded/GeodataProviders/NamesProvider.h>
#include <Dictionaries/Embedded/GeodataProviders/NamesFormatReader.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
/// Reads region name entries from a file identified by its path.
class LanguageRegionsNamesFileReader : public ILanguageRegionsNamesReader
{
private:
    /// Read buffer over the file; consumed row by row in readNext().
    DB::ReadBufferFromFile in;

public:
    /// Constructs the read buffer from `path`.
    /// `explicit` keeps a std::string from converting into a reader implicitly.
    explicit LanguageRegionsNamesFileReader(const std::string & path)
        : in(path)
    {}

    /// Stores the next accepted row in `entry` and returns true;
    /// returns false when the buffer reports end of input.
    bool readNext(RegionNameEntry & entry) override;
};
/** Reads the next accepted row of the form `region_id \t name \n` from the file into `entry`.
  * Rows whose id is not positive are skipped.
  * Returns true if an entry was produced, false once the input is exhausted.
  */
bool LanguageRegionsNamesFileReader::readNext(RegionNameEntry & entry)
{
    while (!in.eof())
    {
        /// Start from a defined value rather than an indeterminate one.
        Int32 read_region_id = 0;
        std::string region_name;

        DB::readIntText(read_region_id, in);
        DB::assertChar('\t', in);
        DB::readString(region_name, in);
        DB::assertChar('\n', in);

        if (read_region_id <= 0)
            continue;

        entry.id = read_region_id;
        /// The local string is not used afterwards, so hand its storage over instead of copying.
        entry.name = std::move(region_name);
        return true;
    }

    return false;
}
bool LanguageRegionsNamesDataSource::isModified() const
@ -55,7 +17,8 @@ size_t LanguageRegionsNamesDataSource::estimateTotalSize() const
/// Creates a reader that parses the names file at `path`.
/// The file is opened through a ReadBufferFromFile, which is handed to the format reader.
ILanguageRegionsNamesReaderPtr LanguageRegionsNamesDataSource::createReader()
{
    /// NOTE(review): presumably pins the file version being read so that
    /// isModified() can report later changes; confirm against updates_tracker.
    updates_tracker.fixCurrentVersion();

    /// The previous early `return std::make_unique<LanguageRegionsNamesFileReader>(path);`
    /// made the two statements below unreachable; it has been removed.
    auto file_reader = std::make_shared<DB::ReadBufferFromFile>(path);
    return std::make_unique<LanguageRegionsNamesFormatReader>(std::move(file_reader));
}
std::string LanguageRegionsNamesDataSource::getLanguage() const