Merge pull request #57574 from ClibMouse/dev-seriesPeriodDetect

Revert "Revert "Implemented series period detect method using pocketfft lib""
This commit is contained in:
Julia Kartseva 2023-12-14 21:10:23 -08:00 committed by GitHub
commit 7a5b40563a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 348 additions and 0 deletions

3
.gitmodules vendored
View File

@ -354,6 +354,9 @@
[submodule "contrib/aklomp-base64"]
path = contrib/aklomp-base64
url = https://github.com/aklomp/base64.git
[submodule "contrib/pocketfft"]
path = contrib/pocketfft
url = https://github.com/mreineck/pocketfft.git
[submodule "contrib/sqids-cpp"]
path = contrib/sqids-cpp
url = https://github.com/sqids/sqids-cpp.git

View File

@ -44,6 +44,7 @@ else ()
endif ()
add_contrib (miniselect-cmake miniselect)
add_contrib (pdqsort-cmake pdqsort)
add_contrib (pocketfft-cmake pocketfft)
add_contrib (crc32-vpmsum-cmake crc32-vpmsum)
add_contrib (sparsehash-c11-cmake sparsehash-c11)
add_contrib (abseil-cpp-cmake abseil-cpp)

1
contrib/pocketfft vendored Submodule

@ -0,0 +1 @@
Subproject commit 9efd4da52cf8d28d14531d14e43ad9d913807546

View File

@ -0,0 +1,10 @@
option (ENABLE_POCKETFFT "Enable pocketfft" ${ENABLE_LIBRARIES})
if (NOT ENABLE_POCKETFFT)
message(STATUS "Not using pocketfft")
return()
endif()
add_library(_pocketfft INTERFACE)
target_include_directories(_pocketfft INTERFACE ${ClickHouse_SOURCE_DIR}/contrib/pocketfft)
add_library(ch_contrib::pocketfft ALIAS _pocketfft)

View File

@ -0,0 +1,59 @@
---
slug: /en/sql-reference/functions/time-series-functions
sidebar_position: 172
sidebar_label: Time Series
---
# Time Series Functions
Below functions are used for time series analysis.
## seriesPeriodDetectFFT
Finds the period of the given time series data using FFT
FFT - [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform)
**Syntax**
``` sql
seriesPeriodDetectFFT(series);
```
**Arguments**
- `series` - An array of numeric values
**Returned value**
- A real value equal to the period of time series
- Returns NAN when number of data points are less than four.
Type: [Float64](../../sql-reference/data-types/float.md).
**Examples**
Query:
``` sql
SELECT seriesPeriodDetectFFT([1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6]) AS print_0;
```
Result:
``` text
┌───────────print_0──────┐
│ 3 │
└────────────────────────┘
```
``` sql
SELECT seriesPeriodDetectFFT(arrayMap(x -> abs((x % 6) - 3), range(1000))) AS print_0;
```
Result:
``` text
┌─print_0─┐
│ 6 │
└─────────┘
```

View File

@ -436,6 +436,10 @@ dbms_target_link_libraries(PRIVATE ch_contrib::zstd)
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::zstd)
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::xz)
if (TARGET ch_contrib::pocketfft)
target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::pocketfft)
endif ()
if (TARGET ch_contrib::icu)
dbms_target_link_libraries (PRIVATE ch_contrib::icu)
endif ()

View File

@ -62,6 +62,7 @@
#cmakedefine01 FIU_ENABLE
#cmakedefine01 USE_BCRYPT
#cmakedefine01 USE_LIBARCHIVE
#cmakedefine01 USE_POCKETFFT
/// This is needed for .incbin in assembly. For some reason, include paths don't work there in presence of LTO.
/// That's why we use absolute paths.

View File

@ -99,6 +99,10 @@ if (TARGET ch_contrib::rapidjson)
list (APPEND PRIVATE_LIBS ch_contrib::rapidjson)
endif()
if (TARGET ch_contrib::pocketfft)
list (APPEND PRIVATE_LIBS ch_contrib::pocketfft)
endif()
if (TARGET ch_contrib::crc32-vpmsum)
list (APPEND PUBLIC_LIBS ch_contrib::crc32-vpmsum)
endif()

View File

@ -0,0 +1,227 @@
#include "config.h"
#if USE_POCKETFFT
# ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wshadow"
# pragma clang diagnostic ignored "-Wextra-semi-stmt"
# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
# endif
# include <pocketfft_hdronly.h>
# ifdef __clang__
# pragma clang diagnostic pop
# endif
# include <cmath>
# include <Columns/ColumnArray.h>
# include <Columns/ColumnsNumber.h>
# include <DataTypes/DataTypeArray.h>
# include <DataTypes/DataTypesNumber.h>
# include <Functions/FunctionFactory.h>
# include <Functions/FunctionHelpers.h>
# include <Functions/IFunction.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
/*Detect Period in time series data using FFT.
* FFT - Fast Fourier transform (https://en.wikipedia.org/wiki/Fast_Fourier_transform)
* 1. Convert time series data to frequency domain using FFT.
* 2. Remove the 0th(the Dc component) and n/2th the Nyquist frequency
* 3. Find the peak value (highest) for dominant frequency component.
* 4. Inverse of the dominant frequency component is the period.
*/
class FunctionSeriesPeriodDetectFFT : public IFunction
{
public:
static constexpr auto name = "seriesPeriodDetectFFT";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSeriesPeriodDetectFFT>(); }
std::string getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors args{{"time_series", &isArray<IDataType>, nullptr, "Array"}};
validateFunctionArgumentTypes(*this, arguments, args);
return std::make_shared<DataTypeFloat64>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
ColumnPtr array_ptr = arguments[0].column;
const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());
const IColumn & src_data = array->getData();
const ColumnArray::Offsets & offsets = array->getOffsets();
auto res = ColumnFloat64::create(input_rows_count);
auto & res_data = res->getData();
ColumnArray::Offset prev_src_offset = 0;
Float64 period;
for (size_t i = 0; i < input_rows_count; ++i)
{
ColumnArray::Offset curr_offset = offsets[i];
if (executeNumbers<UInt8>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<UInt16>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<UInt32>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<UInt64>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Int8>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Int16>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Int32>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Int64>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Float32>(src_data, period, prev_src_offset, curr_offset)
|| executeNumbers<Float64>(src_data, period, prev_src_offset, curr_offset))
{
res_data[i] = period;
prev_src_offset = curr_offset;
}
else
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of first argument of function {}",
arguments[0].column->getName(),
getName());
}
return res;
}
template <typename T>
bool executeNumbers(const IColumn & src_data, Float64 & period, ColumnArray::Offset & start, ColumnArray::Offset & end) const
{
const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data);
if (!src_data_concrete)
return false;
const PaddedPODArray<T> & src_vec = src_data_concrete->getData();
chassert(start <= end);
size_t len = end - start;
if (len < 4)
{
period = NAN; // At least four data points are required to detect period
return true;
}
std::vector<Float64> src((src_vec.begin() + start), (src_vec.begin() + end));
std::vector<std::complex<double>> out((len / 2) + 1);
pocketfft::shape_t shape{len};
pocketfft::shape_t axes;
axes.reserve(shape.size());
for (size_t i = 0; i < shape.size(); ++i)
axes.push_back(i);
pocketfft::stride_t stride_src{sizeof(double)};
pocketfft::stride_t stride_out{sizeof(std::complex<double>)};
pocketfft::r2c(shape, stride_src, stride_out, axes, pocketfft::FORWARD, src.data(), out.data(), static_cast<double>(1));
size_t spec_len = (len - 1) / 2; //removing the nyquist element when len is even
double max_mag = 0;
size_t idx = 1;
for (size_t i = 1; i < spec_len; ++i)
{
double magnitude = sqrt(out[i].real() * out[i].real() + out[i].imag() * out[i].imag());
if (magnitude > max_mag)
{
max_mag = magnitude;
idx = i;
}
}
// In case all FFT values are zero, it means the input signal is flat.
// It implies the period of the series should be 0.
if (max_mag == 0)
{
period = 0;
return true;
}
std::vector<double> xfreq(spec_len);
double step = 0.5 / (spec_len - 1);
for (size_t i = 0; i < spec_len; ++i)
xfreq[i] = i * step;
auto freq = xfreq[idx];
period = std::round(1 / freq);
return true;
}
};
REGISTER_FUNCTION(SeriesPeriodDetectFFT)
{
factory.registerFunction<FunctionSeriesPeriodDetectFFT>(FunctionDocumentation{
.description = R"(
Finds the period of the given time series data using FFT
FFT - Fast Fourier transform (https://en.wikipedia.org/wiki/Fast_Fourier_transform)
**Syntax**
``` sql
seriesPeriodDetectFFT(series);
```
**Arguments**
- `series` - An array of numeric values
**Returned value**
- A real value equal to the period of time series
- Returns NAN when number of data points are less than four.
Type: [Float64](../../sql-reference/data-types/float.md).
**Examples**
Query:
``` sql
SELECT seriesPeriodDetectFFT([1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6, 1, 4, 6]) AS print_0;
```
Result:
``` text
print_0
3
```
``` sql
SELECT seriesPeriodDetectFFT(arrayMap(x -> abs((x % 6) - 3), range(1000))) AS print_0;
```
Result:
``` text
print_0
6
```
)",
.categories{"Time series analysis"}});
}
}
#endif

View File

@ -169,5 +169,8 @@ endif()
if (TARGET ch_contrib::libarchive)
set(USE_LIBARCHIVE 1)
endif()
if (TARGET ch_contrib::pocketfft)
set(USE_POCKETFFT 1)
endif()
set(SOURCE_DIR ${PROJECT_SOURCE_DIR})

View File

@ -0,0 +1,12 @@
14
3
3
3
0
62
6
6
nan
3
0
nan

View File

@ -0,0 +1,22 @@
-- Tags: no-fasttest
DROP TABLE IF EXISTS tb1;
CREATE TABLE tb1 (n UInt32, a Array(Int32)) engine=Memory;
INSERT INTO tb1 VALUES (1, [10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30]), (2, [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), (3, [6, 3, 4]);
SELECT seriesPeriodDetectFFT([139, 87, 110, 68, 54, 50, 51, 53, 133, 86, 141, 97, 156, 94, 149, 95, 140, 77, 61, 50, 54, 47, 133, 72, 152, 94, 148, 105, 162, 101, 160, 87, 63, 53, 55, 54, 151, 103, 189, 108, 183, 113, 175, 113, 178, 90, 71, 62, 62, 65, 165, 109, 181, 115, 182, 121, 178, 114, 170]);
SELECT seriesPeriodDetectFFT([10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30, 10, 20, 30]);
SELECT seriesPeriodDetectFFT([10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34]);
SELECT seriesPeriodDetectFFT([10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400, 10.1, 10, 400]);
SELECT seriesPeriodDetectFFT([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]);
SELECT seriesPeriodDetectFFT(arrayMap(x -> sin(x / 10), range(1000)));
SELECT seriesPeriodDetectFFT(arrayMap(x -> abs((x % 6) - 3), range(1000)));
SELECT seriesPeriodDetectFFT(arrayMap(x -> if((x % 6) < 3, 3, 0), range(1000)));
SELECT seriesPeriodDetectFFT([1,2,3]);
SELECT seriesPeriodDetectFFT(a) FROM tb1;
DROP TABLE IF EXISTS tb1;
SELECT seriesPeriodDetectFFT(); --{ serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
SELECT seriesPeriodDetectFFT([]); -- { serverError ILLEGAL_COLUMN}
SELECT seriesPeriodDetectFFT([NULL, NULL, NULL]); -- { serverError ILLEGAL_COLUMN}
SELECT seriesPeriodDetectFFT([10, 20, 30, 10, 202, 30, NULL]); -- { serverError ILLEGAL_COLUMN }

View File

@ -2233,6 +2233,7 @@ seektable
sequenceCount
sequenceMatch
sequenceNextNode
seriesPeriodDetectFFT
serverTimeZone
serverTimezone
serverUUID