ClickHouse/dbms/Functions/URL/URLPathHierarchy.cpp

#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsStringArray.h>

namespace DB
{
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}

class URLHierarchyImpl
{
private:
    Pos begin;
    Pos pos;
    Pos end;

public:
    static constexpr auto name = "URLHierarchy";
    static String getName() { return name; }

    static size_t getNumberOfArguments() { return 1; }

    static void checkArguments(const DataTypes & arguments)
    {
        if (!isString(arguments[0]))
            throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}

    /// Returns the position of the argument that is the column of rows
    static size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Called for each next string.
    void set(Pos pos_, Pos end_)
    {
        begin = pos = pos_;
        end = end_;
    }

    /// Get the next token, if any, or return false.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Code from URLParser.
        if (pos == end)
            return false;

        if (pos == begin)
        {
            /// Let's parse everything that goes before the path

            /// Assume that the protocol has already been changed to lowercase.
            while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9')))
                ++pos;

            /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
             * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there
             * For the rest, simply return the full URL as the only element of the hierarchy.
             */
            if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
            {
                pos = end;
                token_begin = begin;
                token_end = end;
                return true;
            }

            /// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#`
            while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
                ++pos;

            if (pos != end)
                ++pos;

            token_begin = begin;
            token_end = pos;

            return true;
        }

        /// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
        while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;
        if (pos == end)
            return false;
        while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
            ++pos;

        if (pos != end)
            ++pos;

        token_begin = begin;
        token_end = pos;

        return true;
    }
};


struct NameURLHierarchy { static constexpr auto name = "URLHierarchy"; };
using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>;

void registerFunctionURLHierarchy(FunctionFactory & factory)
{
    factory.registerFunction<FunctionURLHierarchy>();
}

}
Every function in its own file, part 6 (#3661) * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] 2018-11-25 23:24:26 +00:00			`#include <Functions/FunctionFactory.h>`
			`#include <Functions/FunctionsStringArray.h>`

			`namespace DB`
			`{`
Added missing ErrorCodes, part 2 2020-02-25 18:10:48 +00:00			`namespace ErrorCodes`
			`{`
			`extern const int ILLEGAL_TYPE_OF_ARGUMENT;`
			`}`
Every function in its own file, part 6 (#3661) * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] 2018-11-25 23:24:26 +00:00
			`class URLHierarchyImpl`
			`{`
			`private:`
			`Pos begin;`
			`Pos pos;`
			`Pos end;`

			`public:`
			`static constexpr auto name = "URLHierarchy";`
			`static String getName() { return name; }`

			`static size_t getNumberOfArguments() { return 1; }`

			`static void checkArguments(const DataTypes & arguments)`
			`{`
			`if (!isString(arguments[0]))`
			`throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",`
			`ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);`
			`}`

			`void init(Block & /block/, const ColumnNumbers & /arguments/) {}`

			`/// Returns the position of the argument that is the column of rows`
clang-tidy, part 2 2020-03-18 00:57:00 +00:00			`static size_t getStringsArgumentPosition()`
Every function in its own file, part 6 (#3661) * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] * Every function in its own file, part 6 [#CLICKHOUSE-2] 2018-11-25 23:24:26 +00:00			`{`
			`return 0;`
			`}`

			`/// Called for each next string.`
			`void set(Pos pos_, Pos end_)`
			`{`
			`begin = pos = pos_;`
			`end = end_;`
			`}`

			`/// Get the next token, if any, or return false.`
			`bool get(Pos & token_begin, Pos & token_end)`
			`{`
			`/// Code from URLParser.`
			`if (pos == end)`
			`return false;`

			`if (pos == begin)`
			`{`
			`/// Let's parse everything that goes before the path`

			`/// Assume that the protocol has already been changed to lowercase.`
			`while (pos < end && ((pos > 'a' && pos < 'z') \|\| (pos > '0' && pos < '9')))`
			`++pos;`

			`/** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.`
			`* (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there`
			`* For the rest, simply return the full URL as the only element of the hierarchy.`
			`*/`
			`if (pos == begin \|\| pos == end \|\| !(pos++ == ':' && pos < end && pos++ == '/' && pos < end && *pos++ == '/' && pos < end))`
			`{`
			`pos = end;`
			`token_begin = begin;`
			`token_end = end;`
			`return true;`
			`}`

			/// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#`
			`while (pos < end && !(pos == '/' \|\| pos == '?' \|\| *pos == '#'))`
			`++pos;`

			`if (pos != end)`
			`++pos;`

			`token_begin = begin;`
			`token_end = pos;`

			`return true;`
			`}`

			/// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
			`while (pos < end && (pos == '/' \|\| pos == '?' \|\| *pos == '#'))`
			`++pos;`
			`if (pos == end)`
			`return false;`
			`while (pos < end && !(pos == '/' \|\| pos == '?' \|\| *pos == '#'))`
			`++pos;`

			`if (pos != end)`
			`++pos;`

			`token_begin = begin;`
			`token_end = pos;`

			`return true;`
			`}`
			`};`


			`struct NameURLHierarchy { static constexpr auto name = "URLHierarchy"; };`
			`using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>;`

			`void registerFunctionURLHierarchy(FunctionFactory & factory)`
			`{`
			`factory.registerFunction<FunctionURLHierarchy>();`
			`}`

			`}`