Add integration tests for hive query

2024-11-24 16:42:05 +00:00 · 2021-11-18 16:17:49 +08:00 · 2021-11-18 16:17:49 +08:00 · 9902ccefc5
commit 9902ccefc5
parent f33ec0fd47
16 changed files with 740 additions and 5 deletions
--- a/docker/test/integration/hive_server/Dockerfile
+++ b/docker/test/integration/hive_server/Dockerfile
@ -0,0 +1,47 @@
+FROM ubuntu:20.04
+LABEL maintainer="lgbo-ustc <lgbo.ustc@gmail.com>"
+
+# Test-only image bundling Hadoop 3.1.0, Hive 2.3.9 and a local MySQL server.
+# It stays root on purpose: start.sh starts system services, and the HDFS_*_USER /
+# YARN_*_USER variables below name root as the user for the Hadoop daemons.
+
+# `update` and `install` share one layer so a cached, stale package index can never
+# be combined with a changed install list. DEBIAN_FRONTEND is set inline (not via
+# ENV) so package configuration prompts cannot stall the build while nothing leaks
+# into the runtime environment. vim, ping and net-tools are debugging aids.
+RUN apt-get update && \
+        DEBIAN_FRONTEND=noninteractive apt-get install -y \
+            iputils-ping \
+            mysql-server \
+            net-tools \
+            openjdk-8-jre \
+            openssh-client \
+            openssh-server \
+            vim \
+            wget && \
+        rm -rf /var/lib/apt/lists/*
+
+# Both tarballs come from archive.apache.org: dlcdn.apache.org only carries the
+# current releases, so a pinned older version eventually disappears from it.
+RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
+        tar -xf hadoop-3.1.0.tar.gz && rm -rf hadoop-3.1.0.tar.gz
+RUN wget https://archive.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz && \
+        tar -xf apache-hive-2.3.9-bin.tar.gz && rm -rf apache-hive-2.3.9-bin.tar.gz
+
+# JDBC driver used by Hive to reach the MySQL-backed metastore (see hive-site.xml).
+RUN wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.27.tar.gz && \
+        tar -xf mysql-connector-java-8.0.27.tar.gz && \
+        mv mysql-connector-java-8.0.27/mysql-connector-java-8.0.27.jar /apache-hive-2.3.9-bin/lib/ && \
+        rm -rf mysql-connector-java-8.0.27.tar.gz mysql-connector-java-8.0.27
+
+# Root key pair authorised for root logins; the same pair doubles as the sshd host key.
+# NOTE(review): the private key is baked into the image - acceptable only because this
+# is a throwaway test fixture. Presumably needed by start-all.sh, which is run by start.sh.
+RUN mkdir -p /root/.ssh && \
+        ssh-keygen -t rsa -b 2048 -P '' -f /root/.ssh/id_rsa && \
+        cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys && \
+        cp /root/.ssh/id_rsa /etc/ssh/ssh_host_rsa_key && \
+        cp /root/.ssh/id_rsa.pub /etc/ssh/ssh_host_rsa_key.pub
+
+ENV JAVA_HOME=/usr \
+    HADOOP_HOME=/hadoop-3.1.0 \
+    HDFS_NAMENODE_USER=root \
+    HDFS_DATANODE_USER=root \
+    HDFS_SECONDARYNAMENODE_USER=root \
+    YARN_RESOURCEMANAGER_USER=root \
+    YARN_NODEMANAGER_USER=root \
+    HDFS_DATANODE_SECURE_USER=hdfs
+ENV PATH=/apache-hive-2.3.9-bin/bin:/hadoop-3.1.0/bin:/hadoop-3.1.0/sbin:$PATH
+
+# core-site.xml itself is not copied: start.sh renders it from the template at
+# container start, once the container's host name is known.
+COPY core-site.xml.template hadoop-env.sh hdfs-site.xml mapred-site.xml yarn-site.xml /hadoop-3.1.0/etc/hadoop/
+COPY hive-site.xml /apache-hive-2.3.9-bin/conf/
+COPY prepare_hive_data.sh demo_data.txt start.sh /
+
--- a/docker/test/integration/hive_server/core-site.xml.template
+++ b/docker/test/integration/hive_server/core-site.xml.template
@ -0,0 +1,14 @@
+<!-- Template for core-site.xml. start.sh runs sed over this file at container
+     start to fill in the host-name placeholder of fs.defaultFS. -->
+<configuration>
+    <!-- Default file system URI; the host part is the placeholder replaced by start.sh. -->
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://HOSTNAME:9000</value>
+    </property>
+
+    <!-- Proxy-user settings for root: any host, any group. -->
+    <property>
+        <name>hadoop.proxyuser.root.hosts</name>
+        <value>*</value>
+    </property>
+    <property>
+        <name>hadoop.proxyuser.root.groups</name>
+        <value>*</value>
+    </property>
+</configuration>
--- a/docker/test/integration/hive_server/demo_data.txt
+++ b/docker/test/integration/hive_server/demo_data.txt
@ -0,0 +1,6 @@
+abc,1,2021-11-16
+abd,15,2021-11-05
+aaa,22,2021-11-16
+dda,0,2021-11-01
+dfb,11,2021-11-05
+jhn,89,2021-11-11
--- a/docker/test/integration/hive_server/hadoop-env.sh
+++ b/docker/test/integration/hive_server/hadoop-env.sh
@ -0,0 +1,422 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+##
+## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS.
+## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS.  THEREFORE,
+## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE
+## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh.
+##
+## Precedence rules:
+##
+## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults
+##
+## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults
+##
+
+# Many of the options here are built from the perspective that users
+# may want to provide OVERWRITING values on the command line.
+# For example:
+#
+#  JAVA_HOME=/usr/java/testing hdfs dfs -ls
+#
+# Therefore, the vast majority (BUT NOT ALL!) of these defaults
+# are configured for substitution and not append.  If append
+# is preferable, modify this file accordingly.
+
+###
+# Generic settings for HADOOP
+###
+
+# Technically, the only required environment variable is JAVA_HOME.
+# All others are optional.  However, the defaults are probably not
+# preferred.  Many sites configure these options outside of Hadoop,
+# such as in /etc/profile.d
+
+# The java implementation to use. By default, this environment
+# variable is REQUIRED on ALL platforms except OS X!
+# Test image: same location as the JAVA_HOME set in the image's Dockerfile
+# (presumably so that java resolves to /usr/bin/java - confirm).  Exported so
+# that processes started from the Hadoop scripts see it as well.
+export JAVA_HOME=/usr/
+
+# Location of Hadoop.  By default, Hadoop will attempt to determine
+# this location based upon its execution path.
+# export HADOOP_HOME=
+
+# Location of Hadoop's configuration information.  i.e., where this
+# file is living. If this is not defined, Hadoop will attempt to
+# locate it based upon its execution path.
+#
+# NOTE: It is recommended that this variable not be set here but in
+# /etc/profile.d or equivalent.  Some options (such as
+# --config) may react strangely otherwise.
+#
+# export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
+
+# The maximum amount of heap to use (Java -Xmx).  If no unit
+# is provided, it will be converted to MB.  Daemons will
+# prefer any Xmx setting in their respective _OPT variable.
+# There is no default; the JVM will autoscale based upon machine
+# memory size.
+# export HADOOP_HEAPSIZE_MAX=
+
+# The minimum amount of heap to use (Java -Xms).  If no unit
+# is provided, it will be converted to MB.  Daemons will
+# prefer any Xms setting in their respective _OPT variable.
+# There is no default; the JVM will autoscale based upon machine
+# memory size.
+# export HADOOP_HEAPSIZE_MIN=
+
+# Enable extra debugging of Hadoop's JAAS binding, used to set up
+# Kerberos security.
+# export HADOOP_JAAS_DEBUG=true
+
+# Extra Java runtime options for all Hadoop commands. We don't support
+# IPv6 yet/still, so by default the preference is set to IPv4.
+# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"
+# For Kerberos debugging, an extended option set logs more information
+# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
+
+# Some parts of the shell code may do special things dependent upon
+# the operating system.  We have to set this here. See the next
+# section as to why....
+export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
+
+
+# Under certain conditions, Java on OS X will throw SCDynamicStore errors
+# in the system logs.
+# See HADOOP-8719 for more information.  If one needs Kerberos
+# support on OS X, one will want to change/remove this extra bit.
+case ${HADOOP_OS_TYPE} in
+  Darwin*)
+    export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.realm= "
+    export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.kdc= "
+    export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.conf= "
+  ;;
+esac
+
+# Extra Java runtime options for some Hadoop commands
+# and clients (i.e., hdfs dfs -blah).  These get appended to HADOOP_OPTS for
+# such commands.  In most cases, this should be left empty and
+# let users supply it on the command line.
+# export HADOOP_CLIENT_OPTS=""
+
+#
+# A note about classpaths.
+#
+# By default, Apache Hadoop overrides Java's CLASSPATH
+# environment variable.  It is configured such
+# that it starts out blank with new entries added after passing
+# a series of checks (file/dir exists, not already listed aka
+# de-duplication).  During de-duplication, wildcards and/or
+# directories are *NOT* expanded to keep it simple. Therefore,
+# if the computed classpath has two specific mentions of
+# awesome-methods-1.0.jar, only the first one added will be seen.
+# If two directories are in the classpath that both contain
+# awesome-methods-1.0.jar, then Java will pick up both versions.
+
+# An additional, custom CLASSPATH. Site-wide configs should be
+# handled via the shellprofile functionality, utilizing the
+# hadoop_add_classpath function for greater control and much
+# harder for apps/end-users to accidentally override.
+# Similarly, end users should utilize ${HOME}/.hadooprc .
+# This variable should ideally only be used as a short-cut,
+# interactive way for temporary additions on the command line.
+# export HADOOP_CLASSPATH="/some/cool/path/on/your/machine"
+
+# Should HADOOP_CLASSPATH be first in the official CLASSPATH?
+# export HADOOP_USER_CLASSPATH_FIRST="yes"
+
+# If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along
+# with the main jar are handled by a separate isolated
+# client classloader when 'hadoop jar', 'yarn jar', or 'mapred job'
+# is utilized. If it is set, HADOOP_CLASSPATH and
+# HADOOP_USER_CLASSPATH_FIRST are ignored.
+# export HADOOP_USE_CLIENT_CLASSLOADER=true
+
+# HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of
+# system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER
+# is enabled. Names ending in '.' (period) are treated as package names, and
+# names starting with a '-' are treated as negative matches. For example,
+# export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop."
+
+# Enable optional, bundled Hadoop features
+# This is a comma delimited list.  It may NOT be overridden via .hadooprc
+# Entries may be added/removed as needed.
+# export HADOOP_OPTIONAL_TOOLS="hadoop-openstack,hadoop-aliyun,hadoop-azure,hadoop-azure-datalake,hadoop-aws,hadoop-kafka"
+
+###
+# Options for remote shell connectivity
+###
+
+# There are some optional components of hadoop that allow for
+# command and control of remote hosts.  For example,
+# start-dfs.sh will attempt to bring up all NNs, DNS, etc.
+
+# Options to pass to SSH when one of the "log into a host and
+# start/stop daemons" scripts is executed
+# export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s"
+
+# The built-in ssh handler will limit itself to 10 simultaneous connections.
+# For pdsh users, this sets the fanout size ( -f )
+# Change this to increase/decrease as necessary.
+# export HADOOP_SSH_PARALLEL=10
+
+# Filename which contains all of the hosts for any remote execution
+# helper scripts such as workers.sh, start-dfs.sh, etc.
+# export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers"
+
+###
+# Options for all daemons
+###
+#
+
+#
+# Many options may also be specified as Java properties.  It is
+# very common, and in many cases, desirable, to hard-set these
+# in daemon _OPTS variables.  Where applicable, the appropriate
+# Java property is also identified.  Note that many are re-used
+# or set differently in certain contexts (e.g., secure vs
+# non-secure)
+#
+
+# Where (primarily) daemon log files are stored.
+# ${HADOOP_HOME}/logs by default.
+# Java property: hadoop.log.dir
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# A string representing this instance of hadoop. $USER by default.
+# This is used in writing log and pid files, so keep that in mind!
+# Java property: hadoop.id.str
+# export HADOOP_IDENT_STRING=$USER
+
+# How many seconds to pause after stopping a daemon
+# export HADOOP_STOP_TIMEOUT=5
+
+# Where pid files are stored.  /tmp by default.
+# export HADOOP_PID_DIR=/tmp
+
+# Default log4j setting for interactive commands
+# Java property: hadoop.root.logger
+# export HADOOP_ROOT_LOGGER=INFO,console
+
+# Default log4j setting for daemons spawned explicitly by
+# --daemon option of hadoop, hdfs, mapred and yarn command.
+# Java property: hadoop.root.logger
+# export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA
+
+# Default log level and output location for security-related messages.
+# You will almost certainly want to change this on a per-daemon basis via
+# the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the
+# defaults for the NN and 2NN override this by default.)
+# Java property: hadoop.security.logger
+# export HADOOP_SECURITY_LOGGER=INFO,NullAppender
+
+# Default process priority level
+# Note that sub-processes will also run at this level!
+# export HADOOP_NICENESS=0
+
+# Default name for the service level authorization file
+# Java property: hadoop.policy.file
+# export HADOOP_POLICYFILE="hadoop-policy.xml"
+
+#
+# NOTE: this is not used by default!  <-----
+# You can define variables right here and then re-use them later on.
+# For example, it is common to use the same garbage collection settings
+# for all the daemons.  So one could define:
+#
+# export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
+#
+# .. and then use it as per the b option under the namenode.
+
+###
+# Secure/privileged execution
+###
+
+#
+# Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons
+# on privileged ports.  This functionality can be replaced by providing
+# custom functions.  See hadoop-functions.sh for more information.
+#
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol.  Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+# export JSVC_HOME=/usr/bin
+
+#
+# This directory contains pids for secure and privileged processes.
+#export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR}
+
+#
+# This directory contains the logs for secure and privileged processes.
+# Java property: hadoop.log.dir
+# export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR}
+
+#
+# When running a secure daemon, the default value of HADOOP_IDENT_STRING
+# ends up being a bit bogus.  Therefore, by default, the code will
+# replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER.  If one wants
+# to keep HADOOP_IDENT_STRING untouched, then uncomment this line.
+# export HADOOP_SECURE_IDENT_PRESERVE="true"
+
+###
+# NameNode specific parameters
+###
+
+# Default log level and output location for file system related change
+# messages. For non-namenode daemons, the Java property must be set in
+# the appropriate _OPTS if one wants something other than INFO,NullAppender
+# Java property: hdfs.audit.logger
+# export HDFS_AUDIT_LOGGER=INFO,NullAppender
+
+# Specify the JVM options to be used when starting the NameNode.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# a) Set JMX options
+# export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026"
+#
+# b) Set garbage collection logs
+# export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')"
+#
+# c) ... or set them directly
+# export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')"
+
+# this is the default:
+# export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
+
+###
+# SecondaryNameNode specific parameters
+###
+# Specify the JVM options to be used when starting the SecondaryNameNode.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# This is the default:
+# export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
+
+###
+# DataNode specific parameters
+###
+# Specify the JVM options to be used when starting the DataNode.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# This is the default:
+# export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol.  This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+# This will replace the hadoop.id.str Java property in secure mode.
+# export HDFS_DATANODE_SECURE_USER=hdfs
+
+# Supplemental options for secure datanodes
+# By default, Hadoop uses jsvc which needs to know to launch a
+# server jvm.
+# export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server"
+
+###
+# NFS3 Gateway specific parameters
+###
+# Specify the JVM options to be used when starting the NFS3 Gateway.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_NFS3_OPTS=""
+
+# Specify the JVM options to be used when starting the Hadoop portmapper.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_PORTMAP_OPTS="-Xmx512m"
+
+# Supplemental options for privileged gateways
+# By default, Hadoop uses jsvc which needs to know to launch a
+# server jvm.
+# export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server"
+
+# On privileged gateways, user to run the gateway as after dropping privileges
+# This will replace the hadoop.id.str Java property in secure mode.
+# export HDFS_NFS3_SECURE_USER=nfsserver
+
+###
+# ZKFailoverController specific parameters
+###
+# Specify the JVM options to be used when starting the ZKFailoverController.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_ZKFC_OPTS=""
+
+###
+# QuorumJournalNode specific parameters
+###
+# Specify the JVM options to be used when starting the QuorumJournalNode.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_JOURNALNODE_OPTS=""
+
+###
+# HDFS Balancer specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Balancer.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_BALANCER_OPTS=""
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_MOVER_OPTS=""
+
+###
+# Router-based HDFS Federation specific parameters
+# Specify the JVM options to be used when starting the RBF Routers.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HDFS_DFSROUTER_OPTS=""
+###
+
+###
+# Advanced Users Only!
+###
+
+#
+# When building Hadoop, one can add the class paths to the commands
+# via this special env var:
+# export HADOOP_ENABLE_BUILD_PATHS="true"
+
+#
+# To prevent accidents, shell commands may be (superficially) locked
+# to only allow certain users to execute certain subcommands.
+# It uses the format of (command)_(subcommand)_USER.
+#
+# For example, to limit who can execute the namenode command,
+# export HDFS_NAMENODE_USER=hdfs
--- a/docker/test/integration/hive_server/hdfs-site.xml
+++ b/docker/test/integration/hive_server/hdfs-site.xml
@ -0,0 +1,6 @@
+<!-- HDFS settings for the Hive integration-test image. -->
+<configuration>
+    <!-- Keep one copy of each block (presumably because the image runs a
+         single DataNode; confirm). -->
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
+</configuration>
--- a/docker/test/integration/hive_server/hive-site.xml
+++ b/docker/test/integration/hive_server/hive-site.xml
@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<!-- Metastore connection for the Hive test image: a MySQL database named
+     hcatalog on localhost, created on first connection if it is missing. -->
+<configuration>
+    <property>
+        <name>javax.jdo.option.ConnectionURL</name>
+        <value>jdbc:mysql://localhost/hcatalog?createDatabaseIfNotExist=true</value>
+    </property>
+    <!-- Test-only credentials; the account is created by start.sh. -->
+    <property>
+        <name>javax.jdo.option.ConnectionUserName</name>
+        <value>test</value>
+    </property>
+    <property>
+        <name>javax.jdo.option.ConnectionPassword</name>
+        <value>test</value>
+    </property>
+    <!-- The image installs mysql-connector-java 8.0.27; in Connector/J 8.0 the
+         driver class lives in the com.mysql.cj package and the former
+         com.mysql.jdbc.Driver name is deprecated. -->
+    <property>
+        <name>javax.jdo.option.ConnectionDriverName</name>
+        <value>com.mysql.cj.jdbc.Driver</value>
+    </property>
+</configuration>
--- a/docker/test/integration/hive_server/mapred-site.xml
+++ b/docker/test/integration/hive_server/mapred-site.xml
@ -0,0 +1,6 @@
+<!-- MapReduce settings for the Hive integration-test image. -->
+<configuration>
+    <!-- Selects "yarn" as the MapReduce framework (yarn-site.xml ships alongside this file). -->
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
+</configuration>
--- a/docker/test/integration/hive_server/prepare_hive_data.sh
+++ b/docker/test/integration/hive_server/prepare_hive_data.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+# Creates the Hive fixtures used by the integration tests:
+#   test.demo_text - comma-delimited staging table loaded from /demo_data.txt
+#   test.demo      - Parquet table partitioned by `day`
+#   test.demo_orc  - ORC table partitioned by `day`
+# Both partitioned tables are filled from test.demo_text.
+
+# Stop at the first failing step: each statement depends on the ones before it,
+# and carrying on after a failure would run inserts against a half-built schema.
+set -e
+
+hive -e "create database test"
+
+# A trailing backslash inside double quotes joins the lines, so every `hive -e`
+# below still receives its statements as a single-line string.
+hive -e "\
+create table test.demo(id string, score int) PARTITIONED BY(day string) \
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' \
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' \
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'; \
+create table test.demo_orc(id string, score int) PARTITIONED BY(day string) \
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' \
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' \
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';"
+
+hive -e "\
+create table test.demo_text(id string, score int, day string) \
+row format delimited fields terminated by ','; \
+load data local inpath '/demo_data.txt' into table test.demo_text"
+
+# Non-strict dynamic partitioning lets `day` be taken from the selected rows.
+hive -e "\
+set hive.exec.dynamic.partition.mode=nonstrict; \
+insert into test.demo partition(day) select * from test.demo_text; \
+insert into test.demo_orc partition(day) select * from test.demo_text"
--- a/docker/test/integration/hive_server/start.sh
+++ b/docker/test/integration/hive_server/start.sh
@ -0,0 +1,12 @@
+service ssh start
+sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml
+hadoop namenode -format
+start-all.sh
+service mysql start
+mysql -u root -e "CREATE USER \"test\"@\"localhost\" IDENTIFIED BY \"test\""
+mysql -u root -e "GRANT ALL  ON * . * TO 'test'@'localhost'"
+schematool -initSchema -dbType mysql
+#nohup hiveserver2 &
+nohup hive --service metastore &
+bash /prepare_hive_data.sh
+while true; do sleep 1000; done
--- a/docker/test/integration/hive_server/yarn-site.xml
+++ b/docker/test/integration/hive_server/yarn-site.xml
@ -0,0 +1,32 @@
+<!-- YARN settings for the Hive integration-test image. -->
+<configuration>
+    <!-- Auxiliary service run by the NodeManager. -->
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+
+    <!-- Class path handed to YARN applications; every entry points into the
+         Hadoop tree unpacked at /hadoop-3.1.0 in the image. -->
+    <property>
+        <name>yarn.application.classpath</name>
+        <value>/hadoop-3.1.0/etc/hadoop,/hadoop-3.1.0/share/hadoop/common/*,/hadoop-3.1.0/share/hadoop/common/lib/*,/hadoop-3.1.0/share/hadoop/hdfs/*, /hadoop-3.1.0/share/hadoop/hdfs/lib/*, /hadoop-3.1.0/share/hadoop/mapreduce/*, /hadoop-3.1.0/share/hadoop/mapreduce/lib/*, /hadoop-3.1.0/share/hadoop/yarn/*, /hadoop-3.1.0/share/hadoop/yarn/lib/*</value>
+    </property>
+
+    <property>
+        <name>yarn.nodemanager.delete.debug-delay-sec</name>
+        <value>600</value>
+        <description>
+            Number of seconds after an application finishes before the nodemanager's
+            DeletionService will delete the application's localized file directory
+            and log directory.
+
+            To diagnose Yarn application problems, set this property's value large
+            enough (for example, to 600 = 10 minutes) to permit examination of these
+            directories. After changing the property's value, you must restart the
+            nodemanager in order for it to have an effect.
+
+            The roots of Yarn applications' work directories is configurable with
+            the yarn.nodemanager.local-dirs property (see below), and the roots
+            of the Yarn applications' log directories is configurable with the
+            yarn.nodemanager.log-dirs property (see also below).
+        </description>
+    </property>
+</configuration>
--- a/docker/test/integration/runner/compose/docker_compose_hive.yml
+++ b/docker/test/integration/runner/compose/docker_compose_hive.yml
@ -0,0 +1,7 @@
+# Hive test environment for the integration tests: one container started from
+# the prebuilt hive_test image through its /start.sh script.
+version: '2.3'
+services:
+  hdfs1:
+    image: lgboustc/hive_test:v1.0
+    # start.sh writes this host name into core-site.xml as the fs.defaultFS host.
+    hostname: hivetest
+    restart: always
+    entrypoint: ["bash", "/start.sh"]
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@ -750,17 +750,24 @@ class ClickHouseCluster:
                                    '--file', p.join(docker_compose_yml_dir, 'docker_compose_nginx.yml')]
        return self.base_nginx_cmd

+    def setup_hive(self, instance, env_variables, docker_compose_yml_dir):
+        self.with_hive = True
+        self.base_cmd.extend(['--file', p.join(docker_compose_yml_dir, 'docker_compose_hive.yml')])
+        self.base_hive_cmd = ['docker-compose', '--env-file', instance.env_file, '--project-name', self.project_name,
+                                    '--file', p.join(docker_compose_yml_dir, 'docker_compose_hive.yml')]
+        return self.base_hive_cmd
+
    def add_instance(self, name, base_config_dir=None, main_configs=None, user_configs=None, dictionaries=None,
                     macros=None, with_zookeeper=False, with_zookeeper_secure=False,
                     with_mysql_client=False, with_mysql=False, with_mysql8=False, with_mysql_cluster=False,
                     with_kafka=False, with_kerberized_kafka=False, with_rabbitmq=False, clickhouse_path_dir=None,
                     with_odbc_drivers=False, with_postgres=False, with_postgres_cluster=False, with_hdfs=False,
                     with_kerberized_hdfs=False, with_mongo=False, with_mongo_secure=False, with_nginx=False,
-                     with_redis=False, with_minio=False, with_cassandra=False, with_jdbc_bridge=False,
+                     with_redis=False, with_minio=False, with_cassandra=False, with_jdbc_bridge=False, with_hive=False,
                     hostname=None, env_variables=None, image="clickhouse/integration-test", tag=None,
                     stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, external_dirs=None, tmpfs=None,
                     zookeeper_docker_compose_path=None, minio_certs_dir=None, use_keeper=True,
-                     main_config_name="config.xml", users_config_name="users.xml", copy_common_configs=True, config_root_name="clickhouse"):
+                     main_config_name="config.xml", users_config_name="users.xml", copy_common_configs=True, config_root_name="clickhouse", other_configs=[]):

        """Add an instance to the cluster.

@ -814,6 +821,7 @@ class ClickHouseCluster:
            with_minio=with_minio,
            with_cassandra=with_cassandra,
            with_jdbc_bridge=with_jdbc_bridge,
+            with_hive = with_hive,
            server_bin_path=self.server_bin_path,
            odbc_bridge_bin_path=self.odbc_bridge_bin_path,
            library_bridge_bin_path=self.library_bridge_bin_path,
@ -834,7 +842,8 @@ class ClickHouseCluster:
            copy_common_configs=copy_common_configs,
            external_dirs=external_dirs,
            tmpfs=tmpfs or [],
-            config_root_name=config_root_name)
+            config_root_name=config_root_name,
+            other_configs = other_configs)

        docker_compose_yml_dir = get_docker_compose_path()

@ -927,6 +936,9 @@ class ClickHouseCluster:
        if with_jdbc_bridge and not self.with_jdbc_bridge:
            cmds.append(self.setup_jdbc_bridge_cmd(instance, env_variables, docker_compose_yml_dir))

+        if with_hive:
+            cmds.append(self.setup_hive(instance, env_variables, docker_compose_yml_dir))
+
        logging.debug("Cluster name:{} project_name:{}. Added instance name:{} tag:{} base_cmd:{} docker_compose_yml_dir:{}".format(
            self.name, self.project_name, name, tag, self.base_cmd, docker_compose_yml_dir))
        return instance
@ -1588,6 +1600,12 @@ class ClickHouseCluster:
                self.up_called = True
                time.sleep(10)

+            if self.with_hive and self.base_hive_cmd:
+                logging.debug('Setup hive')
+                subprocess_check_call(self.base_hive_cmd + common_opts)
+                self.up_called = True
+                time.sleep(300)
+
            if self.with_minio and self.base_minio_cmd:
                # Copy minio certificates to minio/certs
                os.mkdir(self.minio_dir)
@ -1823,13 +1841,13 @@ class ClickHouseInstance:
            self, cluster, base_path, name, base_config_dir, custom_main_configs, custom_user_configs,
            custom_dictionaries,
            macros, with_zookeeper, zookeeper_config_path, with_mysql_client,  with_mysql, with_mysql8, with_mysql_cluster, with_kafka, with_kerberized_kafka,
-            with_rabbitmq, with_nginx, with_kerberized_hdfs, with_mongo, with_redis, with_minio, with_jdbc_bridge,
+            with_rabbitmq, with_nginx, with_kerberized_hdfs, with_mongo, with_redis, with_minio, with_jdbc_bridge,with_hive,
            with_cassandra, server_bin_path, odbc_bridge_bin_path, library_bridge_bin_path, clickhouse_path_dir, with_odbc_drivers, with_postgres, with_postgres_cluster,
            clickhouse_start_command=CLICKHOUSE_START_COMMAND,
            main_config_name="config.xml", users_config_name="users.xml", copy_common_configs=True,
            hostname=None, env_variables=None,
            image="clickhouse/integration-test", tag="latest",
-            stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, external_dirs=None, tmpfs=None, config_root_name="clickhouse"):
+            stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, external_dirs=None, tmpfs=None, config_root_name="clickhouse", other_configs=[]):

        self.name = name
        self.base_cmd = cluster.base_cmd
@ -1843,6 +1861,7 @@ class ClickHouseInstance:
        self.custom_main_config_paths = [p.abspath(p.join(base_path, c)) for c in custom_main_configs]
        self.custom_user_config_paths = [p.abspath(p.join(base_path, c)) for c in custom_user_configs]
        self.custom_dictionaries_paths = [p.abspath(p.join(base_path, c)) for c in custom_dictionaries]
+        self.other_custom_config_paths = [p.abspath(p.join(base_path,c)) for c in other_configs]
        self.clickhouse_path_dir = p.abspath(p.join(base_path, clickhouse_path_dir)) if clickhouse_path_dir else None
        self.kerberos_secrets_dir = p.abspath(p.join(base_path, 'secrets'))
        self.macros = macros if macros is not None else {}
@ -2376,6 +2395,8 @@ class ClickHouseInstance:
        os.mkdir(users_d_dir)
        dictionaries_dir = p.abspath(p.join(instance_config_dir, 'dictionaries'))
        os.mkdir(dictionaries_dir)
+        other_conf_dir = p.abspath(p.join(instance_config_dir, 'other_conf.d'))
+        os.mkdir(other_conf_dir)

        def write_embedded_config(name, dest_dir, fix_log_level=False):
            with open(p.join(HELPERS_DIR, name), 'r') as f:
@ -2422,6 +2443,8 @@ class ClickHouseInstance:
        # Copy dictionaries configs to configs/dictionaries
        for path in self.custom_dictionaries_paths:
            shutil.copy(path, dictionaries_dir)
+        for path in self.other_custom_config_paths:
+            shutil.copy(path, other_conf_dir)

        db_dir = p.abspath(p.join(self.path, 'database'))
        logging.debug(f"Setup database dir {db_dir}")
--- a/tests/integration/test_hive_query/__init__.py
+++ b/tests/integration/test_hive_query/__init__.py
--- a/tests/integration/test_hive_query/configs/config.xml
+++ b/tests/integration/test_hive_query/configs/config.xml
@ -0,0 +1,22 @@
+<clickhouse>
+    <remote_servers>
+        <!-- Test only shard config for testing distributed storage -->
+        <simple>
+            <shard>
+                <replica>
+                    <host>localhost</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </simple>
+    </remote_servers>
+
+    <!-- Local cache settings; exact semantics and units are defined by the server (TODO: confirm). -->
+    <local_cache_dir>/clickhouse_local_cache</local_cache_dir>
+    <local_cache_quota>207374182400</local_cache_quota>
+    <local_cache_max_threads>1000</local_cache_max_threads>
+
+    <!-- Path of the HDFS client configuration; the file name matches the hdfs-site.xml
+         that the test passes through other_configs (copied into other_conf.d). -->
+    <hdfs>
+        <libhdfs3_conf>/etc/clickhouse-server/other_conf.d/hdfs-site.xml</libhdfs3_conf>
+    </hdfs>
+
+</clickhouse>
--- a/tests/integration/test_hive_query/configs/hdfs-site.xml
+++ b/tests/integration/test_hive_query/configs/hdfs-site.xml
@ -0,0 +1,6 @@
+<configuration>
+    <!-- Sets the dfs.replication property to 1. Presumably consumed by the HDFS client
+         through the libhdfs3_conf entry in config.xml (TODO: confirm). -->
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
+</configuration>
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@ -0,0 +1,91 @@
+import logging
+import os
+
+import pytest
+from helpers.cluster import ClickHouseCluster
+from helpers.test_tools import TSV
+
+logging.getLogger().setLevel(logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler())
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    """Module-scoped fixture yielding a started cluster with one Hive-enabled node.
+
+    Registers instance 'h0_0_0' with configs/config.xml as a main config and
+    configs/hdfs-site.xml as an extra ("other") config, starts the cluster,
+    yields it to the tests, and always shuts it down afterwards.
+    """
+    # Build the cluster object before entering the try block. If the constructor
+    # raised inside it, the finally clause would reference an unbound `cluster`
+    # and mask the real error with an UnboundLocalError.
+    cluster = ClickHouseCluster(__file__)
+    try:
+        cluster.add_instance('h0_0_0', main_configs=['configs/config.xml'], other_configs=['configs/hdfs-site.xml'], with_hive=True)
+
+        logging.info("Starting cluster ...")
+        cluster.start()
+        yield cluster
+    finally:
+        # Runs on normal teardown and when add_instance/start fail part-way.
+        cluster.shutdown()
+
+def test_create_parquet_table(started_cluster):
+    """Create `default.demo_parquet` with the Hive engine and expect empty output.
+
+    The engine is given the metastore URI thrift://hivetest:9083 plus the
+    arguments 'test' and 'demo' (presumably Hive database and table; confirm).
+    """
+    logging.info('Start testing creating hive table ...')
+    create_sql = """
+    CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day)
+            """
+    instance = started_cluster.instances['h0_0_0']
+    output = instance.query(create_sql)
+    logging.info(f"create result {output}")
+
+    # A successful CREATE TABLE is expected to print nothing.
+    assert output.strip() == ''
+
+def test_create_orc_table(started_cluster):
+    """Create `default.demo_orc` with the Hive engine and expect empty output.
+
+    The engine is given the metastore URI thrift://hivetest:9083 plus the
+    arguments 'test' and 'demo_orc' (presumably Hive database and table; confirm).
+    """
+    logging.info('Start testing creating hive table ...')
+    create_sql = """
+    CREATE TABLE default.demo_orc (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day)
+            """
+    instance = started_cluster.instances['h0_0_0']
+    output = instance.query(create_sql)
+    logging.info(f"create result {output}")
+
+    # A successful CREATE TABLE is expected to print nothing.
+    assert output.strip() == ''
+
+def test_create_text_table(started_cluster):
+    """Create `default.demo_text` with the Hive engine and expect empty output.
+
+    Unlike the other create tests, this table is declared with
+    PARTITION BY (tuple()) rather than PARTITION BY(day).
+    """
+    logging.info('Start testing creating hive table ...')
+    create_sql = """
+    CREATE TABLE default.demo_text (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_text') PARTITION BY (tuple())
+            """
+    instance = started_cluster.instances['h0_0_0']
+    output = instance.query(create_sql)
+    logging.info(f"create result {output}")
+
+    # A successful CREATE TABLE is expected to print nothing.
+    assert output.strip() == ''
+
+def test_parquet_groupby(started_cluster):
+    """Count rows per `day` in `default.demo_parquet` and compare with fixed output.
+
+    Reads a table that this function does not create itself; it is presumably
+    the one made by test_create_parquet_table (confirm test ordering).
+    """
+    logging.info('Start testing groupby ...')
+    groupby_sql = """
+    SELECT day, count(*) FROM default.demo_parquet group by day order by day
+            """
+    # Tab-separated `day<TAB>count` rows, each terminated by a newline.
+    expected_result = (
+        "2021-11-01\t1\n"
+        "2021-11-05\t2\n"
+        "2021-11-11\t1\n"
+        "2021-11-16\t2\n"
+    )
+    instance = started_cluster.instances['h0_0_0']
+    assert instance.query(groupby_sql) == expected_result
+
+def test_orc_groupby(started_cluster):
+    """Count rows per `day` in `default.demo_orc` and compare with fixed output.
+
+    Reads a table that this function does not create itself; it is presumably
+    the one made by test_create_orc_table (confirm test ordering).
+    """
+    logging.info('Start testing groupby ...')
+    groupby_sql = """
+    SELECT day, count(*) FROM default.demo_orc group by day order by day
+            """
+    # Tab-separated `day<TAB>count` rows, each terminated by a newline.
+    expected_result = (
+        "2021-11-01\t1\n"
+        "2021-11-05\t2\n"
+        "2021-11-11\t1\n"
+        "2021-11-16\t2\n"
+    )
+    instance = started_cluster.instances['h0_0_0']
+    assert instance.query(groupby_sql) == expected_result
+
+def test_text_count(started_cluster):
+    """Run a per-day count with `format_csv_delimiter` set to the 0x01 character.
+
+    NOTE(review): the test name suggests the text-backed table
+    (default.demo_text), but the query reads default.demo_orc. Confirm which
+    table was intended before changing it.
+    """
+    # The query string is not raw, so Python turns the escape in the SETTINGS
+    # clause into the actual 0x01 control character before it is sent.
+    count_sql = """
+    SELECT day, count(*) FROM default.demo_orc group by day order by day SETTINGS format_csv_delimiter = '\x01'
+            """
+    # Tab-separated `day<TAB>count` rows, each terminated by a newline.
+    expected_result = (
+        "2021-11-01\t1\n"
+        "2021-11-05\t2\n"
+        "2021-11-11\t1\n"
+        "2021-11-16\t2\n"
+    )
+    instance = started_cluster.instances['h0_0_0']
+    assert instance.query(count_sql) == expected_result