diff --git a/qhub/deploy.py b/qhub/deploy.py index 6d3a898430..1390f63472 100644 --- a/qhub/deploy.py +++ b/qhub/deploy.py @@ -22,6 +22,10 @@ logger = logging.getLogger(__name__) +# check and retry settings +NUM_ATTEMPTS = 10 +TIMEOUT = 10 # seconds + def deploy_configuration( config, @@ -433,20 +437,25 @@ def provision_04_kubernetes_ingress(stage_outputs, config, check=True): def check_04_kubernetes_ingress(stage_outputs, qhub_config): directory = "stages/04-kubernetes-ingress" - def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5): - # normalize hostname to ip address - host = socket.gethostbyname(host) - + def _attempt_tcp_connect(host, port, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT): for i in range(num_attempts): s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.settimeout(5) - result = s.connect_ex((host, port)) - if result == 0: - print(f"Attempt {i+1} succedded to connect to tcp://{host}:{port}") - return True - s.close() - print(f"Attempt {i+1} failed to connect to tcp tcp://{host}:{port}") + try: + # normalize hostname to ip address + ip = socket.gethostbyname(host) + s.settimeout(5) + result = s.connect_ex((ip, port)) + if result == 0: + print(f"Attempt {i+1} succedded to connect to tcp://{ip}:{port}") + return True + print(f"Attempt {i+1} failed to connect to tcp tcp://{ip}:{port}") + except socket.gaierror: + print(f"Attempt {i+1} failed to get IP for {host}...") + finally: + s.close() + time.sleep(timeout) + return False tcp_ports = { @@ -459,6 +468,7 @@ def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5): } ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"] host = ip_or_name["hostname"] or ip_or_name["ip"] + host = host.strip("\n") for port in tcp_ports: if not _attempt_tcp_connect(host, port): @@ -512,17 +522,19 @@ def provision_ingress_dns( ) if check: - check_ingress_dns(stage_outputs, config) + check_ingress_dns(stage_outputs, config, disable_prompt) -def check_ingress_dns(stage_outputs, config): +def check_ingress_dns(stage_outputs, config, disable_prompt): directory = "stages/04-kubernetes-ingress" ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"] ip = socket.gethostbyname(ip_or_name["hostname"] or ip_or_name["ip"]) domain_name = config["domain"] - def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5): + def _attempt_dns_lookup( + domain_name, ip, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT + ): for i in range(num_attempts): try: resolved_ip = socket.gethostbyname(domain_name) @@ -542,11 +554,25 @@ def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5): time.sleep(timeout) return False - if not _attempt_dns_lookup(domain_name, ip): - print( - f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}" - ) - sys.exit(1) + attempt = 0 + while not _attempt_dns_lookup(domain_name, ip): + sleeptime = 60 * (2 ** attempt) + if not disable_prompt: + input( + f"After attempting to poll the DNS, the record for domain={domain_name} appears not to exist, " + f"has recently been updated, or has yet to fully propogate. This non-deterministic behavior is likely due to " + f"DNS caching and will likely resolve itself in a few minutes.\n\n\tTo poll the DNS again in {sleeptime} seconds " + f"[Press Enter].\n\n...otherwise kill the process and run the deployment again later..." + ) + + print(f"Will attempt to poll DNS again in {sleeptime} seconds...") + time.sleep(sleeptime) + attempt += 1 + if attempt == 5: + print( + f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}" + ) + sys.exit(1) def provision_05_kubernetes_keycloak(stage_outputs, config, check=True): @@ -589,8 +615,8 @@ def _attempt_keycloak_connection( realm_name, client_id, verify=False, - num_attempts=3, - timeout=5, + num_attempts=NUM_ATTEMPTS, + timeout=TIMEOUT, ): for i in range(num_attempts): try: @@ -665,8 +691,8 @@ def _attempt_keycloak_connection( client_id, qhub_realm, verify=False, - num_attempts=5, - timeout=10, + num_attempts=NUM_ATTEMPTS, + timeout=TIMEOUT, ): for i in range(num_attempts): try: @@ -784,7 +810,9 @@ def check_07_kubernetes_services(stage_outputs, config): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - def _attempt_connect_url(url, verify=False, num_attempts=3, timeout=5): + def _attempt_connect_url( + url, verify=False, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT + ): for i in range(num_attempts): response = requests.get(service_url, verify=verify) if response.status_code < 400: @@ -792,6 +820,7 @@ def _attempt_connect_url(url, verify=False, num_attempts=3, timeout=5): return True else: print(f"Attempt {i+1} health check failed for url={url}") + time.sleep(timeout) return False services = stage_outputs[directory]["service_urls"]["value"]