mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
more fixes after rebase
This commit is contained in:
parent
ce1e2b18a3
commit
f36e8f13c0
@ -98,7 +98,7 @@ private:
|
||||
|
||||
|
||||
/// Special class of exceptions, used mostly in ParallelParsingInputFormat for
|
||||
/// more convinient calculation of problem line number.
|
||||
/// more convenient calculation of problem line number.
|
||||
class ParsingException : public Exception
|
||||
{
|
||||
public:
|
||||
|
@ -61,7 +61,7 @@ struct ExtractStringImpl
|
||||
// read an ASCII word
|
||||
static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
|
||||
{
|
||||
// jump seperators
|
||||
// jump separators
|
||||
while (pos < end && !isAlphaNumericASCII(*pos))
|
||||
++pos;
|
||||
|
||||
@ -93,7 +93,7 @@ struct ExtractStringImpl
|
||||
// read one UTF8 word from pos to word
|
||||
static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
|
||||
{
|
||||
// jump UTF8 seperator
|
||||
// jump UTF8 separator
|
||||
while (pos < end && isUTF8Sep(*pos))
|
||||
++pos;
|
||||
word_buf.clear();
|
||||
@ -112,7 +112,7 @@ private:
|
||||
((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
|
||||
}
|
||||
|
||||
// we use ASCII non-alphanum character as UTF8 seperator
|
||||
// we use ASCII non-alphanum character as UTF8 separator
|
||||
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
|
||||
|
||||
// read one UTF8 character and return it
|
||||
|
@ -151,7 +151,7 @@ template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensit
|
||||
struct SimhashImpl
|
||||
{
|
||||
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
|
||||
// we made an assumption that the size of one word cann't exceed 128, which may not true
|
||||
// we made an assumption that the size of one word can't exceed 128, which may not be true
|
||||
// if some word's size exceeds 128, it would be cut up into several words
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
|
||||
@ -203,7 +203,7 @@ struct SimhashImpl
|
||||
return res_bit.to_ullong();
|
||||
}
|
||||
|
||||
// Simhash word shingle calculate funtion: String -> UInt64
|
||||
// Simhash word shingle calculate function: String -> UInt64
|
||||
// this function extracts n-word shingles from the input string, and maintains a 64-dimensional vector as well
|
||||
// for each word shingle, calculate a 64 bit hash value, and update the vector according to the hash value
|
||||
// finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0
|
||||
@ -211,8 +211,8 @@ struct SimhashImpl
|
||||
// word shingle hash value calculate:
|
||||
// 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values
|
||||
// to calculate the first word shingle hash value
|
||||
// 2. next, we extrac one word each time, and calculate a new hash value of the new word,then use the latest N hash
|
||||
// values to caculate the next word shingle hash value
|
||||
// 2. next, we extract one word each time, and calculate a new hash value of the new word, then use the latest N hash
|
||||
// values to calculate the next word shingle hash value
|
||||
static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
|
||||
const char * data,
|
||||
size_t size,
|
||||
@ -254,12 +254,12 @@ struct SimhashImpl
|
||||
// for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location,
|
||||
// so we need to store the new word hash into the location of a0; then this array becomes
|
||||
// |a5|a1|a2|a3|a4|, next time, a1 becomes the oldest location, and we need to store the new
|
||||
// word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4|
|
||||
// word hash value into location of a1, then array become |a5|a6|a2|a3|a4|
|
||||
nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size());
|
||||
offset = (offset + 1) % N;
|
||||
// because of the way the word hashes are stored, in order to not lose the word shingle's
|
||||
// sequence information, when calculating the word shingle hash value, we need to provide the offset
|
||||
// inforation, which is the offset of the first word's hash value of the word shingle
|
||||
// information, which is the offset of the first word's hash value of the word shingle
|
||||
hash_value = hash_functor(nword_hashes, N, offset);
|
||||
std::bitset<64> bits(hash_value);
|
||||
for (size_t i = 0; i < 64; ++i)
|
||||
|
@ -31,7 +31,7 @@ message ExternalTable {
|
||||
|
||||
// Data to insert to the external table.
|
||||
// If a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used,
|
||||
// then data for insertion to the same external table can be splitted between multiple QueryInfos.
|
||||
// then data for insertion to the same external table can be split between multiple QueryInfos.
|
||||
string data = 3;
|
||||
|
||||
// Format of the data to insert to the external table.
|
||||
|
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
set +x
|
||||
|
||||
# Check for typos in code.
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
wthenn
|
||||
thenn
|
||||
fpr
|
||||
creat
|
||||
parsering
|
||||
@ -6,8 +6,6 @@ nd
|
||||
ect
|
||||
pullrequest
|
||||
pullrequests
|
||||
ans
|
||||
splitted
|
||||
thenn
|
||||
ths
|
||||
offsett
|
||||
|
Loading…
Reference in New Issue
Block a user