ClickHouse/tests/integration/helpers/hdfs_api.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

327 lines
11 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
import gzip
2024-09-27 10:19:39 +00:00
import io
import logging
import os
import socket
import subprocess
2024-09-27 10:19:39 +00:00
import tempfile
2020-09-10 10:02:46 +00:00
import time
from tempfile import NamedTemporaryFile
2024-09-27 10:19:39 +00:00
import requests
2020-09-10 10:02:46 +00:00
import requests_kerberos as reqkerb
2020-09-10 10:02:46 +00:00
class mk_krb_conf(object):
    """Context manager that instantiates a krb5 config for a concrete KDC address.

    On entry the template file ``krb_conf`` is read, every occurrence of the
    placeholder ``hdfskerberos`` is replaced by ``kdc_ip`` and the result is
    written to a fresh temporary file whose path is returned.

    The temporary file is created with ``delete=False`` and is NOT removed on
    exit: ``HDFSApi._run_kinit`` in this file exports the returned path through
    ``KRB5_CONFIG`` and never resets it.
    NOTE(review): presumably later Kerberos-authenticated requests still read
    that file, so deleting it here would break them -- confirm before changing.
    """

    def __init__(self, krb_conf, kdc_ip):
        # Path of the template krb5 configuration file.
        self.krb_conf = krb_conf
        # Address substituted for the "hdfskerberos" placeholder.
        self.kdc_ip = kdc_ip
        # Closed NamedTemporaryFile object once __enter__ has run, else None.
        self.amended_krb_conf = None

    def __enter__(self):
        """Write the amended config and return the path of the temporary file.

        Raises:
            TypeError: if ``kdc_ip`` is not a string (e.g. left as ``None``).
            OSError: if the template cannot be read or the copy cannot be written.
        """
        # str.replace() would fail with an opaque "argument 2 must be str"
        # message; fail early with the name of the offending argument instead.
        if not isinstance(self.kdc_ip, str):
            raise TypeError(
                "kdc_ip must be a str, got {}".format(type(self.kdc_ip).__name__)
            )

        with open(self.krb_conf) as f:
            content = f.read()
        amended_content = content.replace("hdfskerberos", self.kdc_ip)

        # The with-block guarantees the handle is closed (and the data flushed)
        # even when write() raises; the file itself stays on disk (delete=False).
        with tempfile.NamedTemporaryFile(delete=False, mode="w+") as amended:
            self.amended_krb_conf = amended
            amended.write(amended_content)

        return self.amended_krb_conf.name

    def __exit__(self, type, value, traceback):
        # Parameter names are kept for interface compatibility even though
        # ``type`` shadows the builtin; none of them is used.
        # The handle is already closed by __enter__; close() is idempotent and
        # kept as a safety net.
        if self.amended_krb_conf is not None:
            self.amended_krb_conf.close()
class HDFSApi(object):
    """Small WebHDFS client (``/webhdfs/v1`` REST endpoints) for integration tests.

    Reads and writes go through the two-step WebHDFS flow: the first request is
    sent to ``hdfs_ip:proxy_port`` and is expected to answer ``307`` with a
    ``Location`` header; the host part of that location (``hdfs1:50075`` or
    ``kerberizedhdfs1:1006``) is replaced by ``hdfs_ip:data_port`` before the
    second request is issued.

    When ``kerberized`` is true, ``kinit`` is run on construction and every
    request is authenticated with ``requests_kerberos``.
    """

    def __init__(
        self,
        user,
        host,
        proxy_port,
        data_port,
        timeout=100,
        kerberized=False,
        principal=None,
        keytab=None,
        krb_conf=None,
        protocol="http",
        hdfs_ip=None,
        kdc_ip=None,
    ):
        """Store connection settings and, if ``kerberized``, obtain a ticket.

        Args:
            user: value sent as ``user.name`` when uploading data.
            host: HDFS host name; used as Kerberos ``hostname_override``.
            proxy_port: port of the endpoint that answers the first request.
            data_port: port substituted into the redirect location.
            timeout: seconds during which ``kinit`` is retried.
            kerberized: whether to run ``kinit`` and use Kerberos auth.
            principal: Kerberos principal (required when ``kerberized``).
            keytab: path of the keytab (required when ``kerberized``).
            krb_conf: path of the krb5 config template.
            protocol: URL scheme of the first request.
            hdfs_ip: address used in URLs and in the ``host`` header.
            kdc_ip: address substituted into the krb5 config template.
        """
        self.host = host
        self.protocol = protocol
        self.proxy_port = proxy_port
        self.data_port = data_port
        self.user = user
        self.kerberized = kerberized
        self.principal = principal
        self.keytab = keytab
        self.timeout = timeout
        self.hdfs_ip = hdfs_ip
        self.kdc_ip = kdc_ip
        self.krb_conf = krb_conf

        # logging.basicConfig(level=logging.DEBUG)
        # logging.getLogger().setLevel(logging.DEBUG)
        # requests_log = logging.getLogger("requests.packages.urllib3")
        # requests_log.setLevel(logging.INFO)
        # requests_log.propagate = True
        # kerb_log = logging.getLogger("requests_kerberos")
        # kerb_log.setLevel(logging.DEBUG)
        # kerb_log.propagate = True

        if kerberized:
            self._run_kinit()
            # A constructor call never yields None, so no None check is needed.
            self.kerberos_auth = reqkerb.HTTPKerberosAuth(
                mutual_authentication=reqkerb.DISABLED,
                hostname_override=self.host,
                principal=self.principal,
            )
        else:
            self.kerberos_auth = None

    def _run_kinit(self):
        """Run ``kinit`` (followed by ``klist``) until it succeeds or times out.

        Side effect: ``KRB5_CONFIG`` in ``os.environ`` is pointed at the
        instantiated krb5 config and left that way.

        Raises:
            Exception: if principal/keytab are missing, or if the command has
                not succeeded within ``self.timeout`` seconds.
        """
        if self.principal is None or self.keytab is None:
            raise Exception("kerberos principal and keytab are required")

        with mk_krb_conf(self.krb_conf, self.kdc_ip) as instantiated_krb_conf:
            logging.debug("instantiated_krb_conf {}".format(instantiated_krb_conf))

            os.environ["KRB5_CONFIG"] = instantiated_krb_conf

            cmd = "(kinit -R -t {keytab} -k {principal} || (sleep 5 && kinit -R -t {keytab} -k {principal})) ; klist".format(
                keytab=self.keytab,
                principal=self.principal,
            )

            start = time.time()

            while time.time() - start < self.timeout:
                try:
                    # Capture both streams: without this res.stdout/res.stderr
                    # are None and the diagnostics below would themselves fail,
                    # hiding the real kinit error behind an AttributeError.
                    res = subprocess.run(
                        cmd,
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                    )
                    stdout_text = res.stdout.decode("utf-8", errors="replace")
                    stderr_text = res.stderr.decode("utf-8", errors="replace")
                    if res.returncode != 0:
                        logging.debug("Stderr:\n{}\n".format(stderr_text))
                        logging.debug("Stdout:\n{}\n".format(stdout_text))
                        raise Exception(
                            "Command {} return non-zero code {}: {}".format(
                                cmd, res.returncode, stderr_text
                            )
                        )

                    # The output is captured now, so log it to keep it visible.
                    logging.debug("Stdout:\n{}\n".format(stdout_text))
                    logging.debug("KDC started, kinit successfully run")
                    return
                except Exception as ex:
                    # Best-effort retry loop: any failure is logged and retried
                    # until the timeout expires.
                    logging.debug("Can't run kinit ... waiting {}".format(str(ex)))
                    time.sleep(1)

        raise Exception("Kinit running failure")

    @staticmethod
    def req_wrapper(func, expected_code, cnt=2, **kwargs):
        """Call ``func(**kwargs)`` up to ``cnt`` times until the status matches.

        Args:
            func: callable returning a response object exposing ``status_code``,
                ``content``, ``headers`` and ``raise_for_status()``.
            expected_code: HTTP status code that counts as success.
            cnt: maximum number of attempts (must be at least 1).
            **kwargs: forwarded verbatim to ``func``.

        Returns:
            The first response whose status code equals ``expected_code``.

        Raises:
            ValueError: if ``cnt`` is smaller than 1.
            Exception: whatever ``raise_for_status()`` raises for the last
                response, or a generic ``Exception`` when the last status is
                unexpected but not an HTTP error (previously ``None`` was
                returned in that case and callers crashed on it later).
        """
        if cnt < 1:
            raise ValueError("cnt must be at least 1, got {}".format(cnt))

        for attempt in range(cnt):
            logging.debug(f"CALL: {str(kwargs)}")
            response_data = func(**kwargs)
            logging.debug(
                f"response_data:{response_data.content} headers:{response_data.headers}"
            )
            if response_data.status_code == expected_code:
                return response_data

            logging.error(
                f"unexpected response_data.status_code {response_data.status_code} != {expected_code}"
            )
            # Only pause when another attempt follows.
            if attempt + 1 < cnt:
                time.sleep(1)

        response_data.raise_for_status()
        # raise_for_status() stays silent for unexpected non-error codes.
        raise Exception(
            "Unexpected status code {} (expected {}) after {} attempt(s)".format(
                response_data.status_code, expected_code, cnt
            )
        )

    def _namenode_url(self, path, operation):
        """Build the URL of the first-step WebHDFS request for ``path``."""
        return "{protocol}://{ip}:{port}/webhdfs/v1{path}?op={op}".format(
            protocol=self.protocol,
            ip=self.hdfs_ip,
            port=self.proxy_port,
            path=path,
            op=operation,
        )

    def _datanode_location(self, response):
        """Return the redirect target with its host replaced by hdfs_ip:data_port."""
        advertised = "kerberizedhdfs1:1006" if self.kerberized else "hdfs1:50075"
        return response.headers["Location"].replace(
            advertised, "{}:{}".format(self.hdfs_ip, self.data_port)
        )

    def _create(self, path, file_data):
        """Upload ``file_data`` to ``path`` via the two-step CREATE operation."""
        response = self.req_wrapper(
            requests.put,
            307,
            url=self._namenode_url(path, "CREATE"),
            allow_redirects=False,
            headers={"host": str(self.hdfs_ip)},
            params={"overwrite": "true"},
            verify=False,
            auth=self.kerberos_auth,
        )
        logging.debug("HDFS api response:{}".format(response.headers))

        location = self._datanode_location(response)

        response = self.req_wrapper(
            requests.put,
            201,
            url=location,
            data=file_data,
            headers={"content-type": "text/plain", "host": str(self.hdfs_ip)},
            params={"file": path, "user.name": self.user},
            allow_redirects=False,
            verify=False,
            auth=self.kerberos_auth,
        )
        logging.debug(f"{response.content} {response.headers}")

    def read_data(self, path, universal_newlines=True):
        """Read the file at ``path`` via the two-step OPEN operation.

        Returns:
            The response ``text`` (str) when ``universal_newlines`` is true,
            otherwise the raw response ``content`` (bytes).
        """
        logging.debug(
            "read_data protocol:{} host:{} ip:{} proxy port:{} data port:{} path: {}".format(
                self.protocol,
                self.host,
                self.hdfs_ip,
                self.proxy_port,
                self.data_port,
                path,
            )
        )
        response = self.req_wrapper(
            requests.get,
            307,
            url=self._namenode_url(path, "OPEN"),
            headers={"host": str(self.hdfs_ip)},
            allow_redirects=False,
            verify=False,
            auth=self.kerberos_auth,
        )
        # additional_params = '&'.join(response.headers['Location'].split('&')[1:2])
        location = self._datanode_location(response)
        logging.debug("redirected to {}".format(location))

        response_data = self.req_wrapper(
            requests.get,
            200,
            url=location,
            # str() for consistency with every other request of this class.
            headers={"host": str(self.hdfs_ip)},
            verify=False,
            auth=self.kerberos_auth,
        )

        if universal_newlines:
            return response_data.text
        return response_data.content

    def write_data(self, path, content):
        """Upload ``content`` (str or bytes-like) to ``path``, overwriting it.

        ``str`` content is encoded with the default ``str.encode()`` codec.
        """
        logging.debug(
            "write_data protocol:{} host:{} port:{} path: {} user:{}, principal:{}".format(
                self.protocol,
                self.host,
                self.proxy_port,
                path,
                self.user,
                self.principal,
            )
        )
        if isinstance(content, str):
            content = content.encode()
        elif not isinstance(content, bytes):
            # Normalise other bytes-like objects (bytearray, memoryview) to
            # bytes; anything else raises TypeError, as the former temp-file
            # round trip did.
            content = bytes(memoryview(content))

        self._create(path, content)

    def write_file(self, path, local_path):
        """Upload the contents of the local file ``local_path`` to ``path``."""
        logging.debug(
            "write_file protocol:{} host:{} port:{} path: {} user:{}, principal:{}".format(
                self.protocol,
                self.host,
                self.proxy_port,
                path,
                self.user,
                self.principal,
            )
        )
        with open(local_path, mode="rb") as fh:
            file_data = fh.read()

        self._create(path, file_data)

    def write_gzip_data(self, path, content):
        """Gzip-compress ``content`` (str or bytes) and upload it to ``path``."""
        if isinstance(content, str):
            content = content.encode()
        out = io.BytesIO()
        with gzip.GzipFile(fileobj=out, mode="wb") as f:
            f.write(content)
        self.write_data(path, out.getvalue())

    def read_gzip_data(self, path):
        """Download ``path``, gunzip it and return the decoded text."""
        compressed = io.BytesIO(self.read_data(path, universal_newlines=False))
        # Context manager so the GzipFile is closed deterministically.
        with gzip.GzipFile(fileobj=compressed) as f:
            return f.read().decode()