esxi_bad_compute_hosts

#!/usr/bin/env python3

from pprint import pprint
import re
import requests
import yaml
import dateutil.parser
from datetime import datetime
from datetime import timezone

MAX_HOURS = 24

COMMON_PATTERNS = (
    (
        "HOST_KAPUT",
        re.compile(r"'An error occurred while communicating with the remote host.'"),
    ),
    (
        "HOST_KAPUT",
        re.compile(
            r"'Unable to communicate with the remote host, since it is disconnected.'"
        ),
    ),
    (
        "HOST_KAPUT",
        re.compile(
            r'Cannot complete login due to an incorrect user name or password."'
        ),
    ),
)


def get_inventory(log_url):
    r = requests.get(log_url + "zuul-info/inventory.yaml")
    return yaml.safe_load(r.text)


def search_error_pattern(log_url):
    founds = {}
    failure_in_re = re.compile(
        ".*NOTICE: To resume at this test target, use the option: --start-at (\S+)"
    )
    r = requests.get(log_url + "job-output.txt")

    task_path = None

    for l in r.text.splitlines():
        if "task path:" in l:
            task_path = l.split(
                "/home/zuul/.ansible/collections/ansible_collections/community/vmware/"
            )[-1]

        for name, pattern in COMMON_PATTERNS:
            if pattern.search(l):
                if name not in founds:
                    founds[name] = 0
                    founds["task_path"] = task_path

                founds[name] += 1

        m = failure_in_re.search(l)
        if m:
            founds["task_path"] = task_path
            founds["failure_in"] = m.group(1)
    return founds


def iter_builds():
    jobs = [
        "ansible-test-cloud-integration-vcenter_1esxi_without_nested-python36_1_of_2",
        "ansible-test-cloud-integration-vcenter_1esxi_without_nested-python36_2_of_2",
        "ansible-test-cloud-integration-vcenter_1esxi_with_nested-python36",
        "ansible-test-cloud-integration-vcenter_2esxi_without_nested-python36",
    ]
    for job in jobs:
        url_template = "https://dashboard.zuul.ansible.com/api/tenant/ansible/builds?job_name={job}&limit=30"
        r = requests.get(url_template.format(job=job))
        builds = r.json()
        for build in builds:
            yield build


results_by_host_id = {}
results_by_region = {}
results_by_age = {}
host_by_host_id = {}
current = {}

for build in iter_builds():
    delta = datetime.now(timezone.utc) - dateutil.parser.parse(build["end_time"] + 'Z')
    age = int(delta.total_seconds() / 3600)
    if age > MAX_HOURS:
        continue

    if build["result"] in ("RETRY_LIMIT", "SKIPPED", "POST_FAILURE"):
        continue
    matches = None
    if build["result"] == "FAILURE":
        print("** https://github.com/ansible-collections/vmware/pull/{change}".format(
                **build
                ))

        matches = search_error_pattern(build["log_url"])
        if "HOST_KAPUT" not in matches:
            print("not matches for log_url: {log_url}".format(log_url=build["log_url"]))
            print(
                (
                    "The failure may not be an hypervisor stability problem. We "
                    "ignore it. ({failure_in} at {task_path})"
                ).format(
                    failure_in=matches.get("failure_in"),
                    task_path=matches.get("task_path"),
                )
            )
            continue
        print("Match: {log_url}".format(log_url=build["log_url"]))

    if not build["log_url"]:
        continue
    inventory = get_inventory(build["log_url"])
    cloud = inventory["all"]["hosts"]["esxi1"]["nodepool"]["cloud"]
    region = inventory["all"]["hosts"]["esxi1"]["nodepool"]["az"]
    host_id = inventory["all"]["hosts"]["esxi1"]["nodepool"]["host_id"]

    if age not in results_by_age:
        results_by_age[age] = {"SUCCESS": [], "FAILURE": []}
    results_by_age[age][build["result"]].append(
        {"log_url": build["log_url"], "matches": matches, "region": region}
    )

    if host_id not in results_by_host_id:
        host_by_host_id[host_id] = "{cloud}-{region}-{host_id}".format(
            cloud=cloud, region=region, host_id=host_id
        )
        results_by_host_id[host_id] = {"SUCCESS": [], "FAILURE": []}
    results_by_host_id[host_id][build["result"]].append(
        {"log_url": build["log_url"], "matches": matches}
    )

    if region not in results_by_region:
        results_by_region[region] = {"SUCCESS": [], "FAILURE": []}
    results_by_region[region][build["result"]].append(
        {"log_url": build["log_url"], "matches": matches}
    )
    if build["change"] not in current:
        current[build["change"]] = {
            "build": build,
            "log_url": build["log_url"],
            "matches": matches,
        }


print("BAD HOSTS (for last {max_hours})".format(max_hours=MAX_HOURS))
for host_id in results_by_host_id.keys():
    if len(results_by_host_id[host_id]["SUCCESS"]) < 2:
        # Not enough metrics
        continue

    rate = len(results_by_host_id[host_id]["SUCCESS"]) / (
        len(results_by_host_id[host_id]["SUCCESS"])
        + len(results_by_host_id[host_id]["FAILURE"])
    )

    if rate < 0.9:
        print(
            "name={name} (rate={rate})".format(name=host_by_host_id[host_id], rate=rate)
        )

print("GOOD HOSTS (for last {max_hours})".format(max_hours=MAX_HOURS))
for host_id in results_by_host_id.keys():
    if len(results_by_host_id[host_id]["SUCCESS"]) < 2:
        # Not enough metrics
        continue

    rate = len(results_by_host_id[host_id]["SUCCESS"]) / (
        len(results_by_host_id[host_id]["SUCCESS"])
        + len(results_by_host_id[host_id]["FAILURE"])
    )
    if rate > 0.9:
        print(
            "name={name} (rate={rate})".format(name=host_by_host_id[host_id], rate=rate)
        )

print("BY ZONE (for last {max_hours})".format(max_hours=MAX_HOURS))
for region, values in results_by_region.items():
    rate = len(values["SUCCESS"]) / (len(values["SUCCESS"]) + len(values["FAILURE"]))
    print("region={region} (rate={rate})".format(region=region, rate=rate))

print("BY AGE")
for age in sorted(results_by_age):
    values = results_by_age[age]
    rate = len(values["SUCCESS"]) / (len(values["SUCCESS"]) + len(values["FAILURE"]))
    print(
        "⏰ {age}h ago (rate={rate})".format(
            age=age, rate=rate, failures=values["FAILURE"]
        )
    )
    for failure in values["FAILURE"]:
        print("    - {failure})".format(age=age, rate=rate, failure=failure))

for job in current.values():
    if not job["matches"]:
        continue
    if "HOST_KAPUT" in job["matches"]:
        print(
            "Restart: https://github.com/ansible-collections/vmware/pull/{change}".format(
                **job["build"]
            )
        )