starter-kit/bots/tests-policy

#!/usr/bin/env python3

# This file is part of Cockpit.
#
# Copyright (C) 2013 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

import argparse
import datetime
import fnmatch
import json
import re
import os
import socket
import sys
import time
import traceback
import urllib

sys.dont_write_bytecode = True

FLAKE_THRESHOLD = 0.4

from task import github
try:
    from machine.testvm import get_test_image
except ImportError:
    # fallback for unit tests where libvirt is not available
    get_test_image = lambda i: i

BOTS = os.path.dirname(os.path.realpath(__file__))

def main():
    script = os.path.basename(__file__)
    parser = argparse.ArgumentParser(description='Check a traceback for a known issue')
    parser.add_argument('-o', "--offline", action='store_true',
            help="Work offline, don't fetch new data or contact servers")
    parser.add_argument("--simple", action="store_true",
            default=(script == "image-naughty"),
            help="Just print out a known issue number")
    parser.add_argument("-v", "--verbose", action="store_true",
            help="Print out more verbose logging info")
    parser.add_argument('context', help="The image to check against")
    opts = parser.parse_args()

    base = None
    if os.environ.get("GITHUB_KNOWN_ISSUE_BASE"):
        netloc = os.environ.get("GITHUB_API", "https://api.github.com")
        base = "{0}/repos/{1}/".format(netloc, os.environ.get("GITHUB_KNOWN_ISSUE_BASE"))

    api = None if opts.offline else github.GitHub(base=base)

    context = opts.context
    if "/" not in context:
        context = "verify/{0}".format(context)
    prefix, _, image = context.partition("/")
    image, _, scenario = image.partition("/")

    try:
        output = sys.stdin.read()
        number = None

        if image and output:
            number = check_known_issue(api, output, image)

        if number and api and api.available:
            try:
                post_github(api, number, output, image)
            except (socket.error, RuntimeError):
                traceback.print_exc()
                sys.stderr.write("{0}: posting update to GitHub failed\n".format(script))
                # Fall through

        # Simple single number output
        if opts.simple:
            if number:
                sys.stdout.write("{0}\n".format(number))
            return 0

        # Otherwise we filter the output and write it back
        if number:
            output = filterSkip(output, "# SKIP Known issue #{0}".format(number))
        elif checkRetry(output):
            output = filterRetry(output, "# RETRY due to failure of test harness or framework")
        elif not opts.offline:
            output = guessFlake(output, context, opts.verbose)

        sys.stdout.write(output)
        return 0

    except RuntimeError as ex:
        sys.stderr.write("{0}: {1}\n".format(script, ex))
        return 1

# -----------------------------------------------------------------------------
# TAP Parsing

# Update TAP output failure and put a skip message
# in the appropriate place
def filterSkip(output, skip):
    lines = output.split("\n")
    for i, line in enumerate(lines):
        if line.startswith("not ok "):
            lines[i] = line[4:] + " " + skip
    return "\n".join(lines)

# Update TAP output failure and put a retry message
# in the appropriate place
def filterRetry(output, retry):
    lines = output.split("\n")
    for i, line in enumerate(lines):
        if line.startswith("not ok "):
            lines[i] = line + " " + retry
    return "\n".join(lines)

# Figure out the name from a failed test
def parseName(output):
    for line in output.split("\n"):
        if line.startswith("not ok "):
            line = line[7:]
            while line[0].isspace() or line[0].isdigit():
                line = line[1:]
            (name, delim, directive) = line.partition("#")
            (name, delim, directive) = name.partition("duration")
            name = name.strip()
            return name
    return ""

# -----------------------------------------------------------------------------
# Flakiness Checks

def guessFlake(output, context, verbose=False):
    # Build up an item just like in tests-data
    item = {
        "pull": None,
        "revision": os.environ.get("TEST_REVISION"),
        "status": "failure",
        "context": context,
        "date": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "merged": None,
        "test": parseName(output),
        "url": None,
        "log": output
    }

    host = os.environ.get("COCKPIT_LEARN_SERVICE_HOST", "learn-cockpit.apps.ci.centos.org")
    port = os.environ.get("COCKPIT_LEARN_SERVICE_PORT", "443")
    response = { }
    flake = False

    url = "{0}://{1}:{2}/predict".format("https" if port == "443" else "http", host, port)
    jsonl = json.dumps(item) + "\n"
    try:
        data = urllib.request.urlopen(url, data=jsonl.encode('utf-8')).read()
    except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))
        data = b"{ }"
    try:
        response = json.loads(data.decode('utf-8'))
    except ValueError as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))

    if verbose and response:
        sys.stderr.write(json.dumps(response, indent=4) + "\n")

    def count(record, field, only):
        values = record.get(field, [])
        for value, count in values:
            if value != only:
                continue
            return count
        return 0

    merged = count(response, "merged", True)
    not_merged = count(response, "merged", False)
    null_merged = count(response, "merged", None)

    score = 0
    total = merged + not_merged + null_merged
    if total:
        score = (merged / total)
    if score > FLAKE_THRESHOLD:
        output += "\n# Flake likely: {0:.3f} (clustering). Please file issue if this not a flake\n".format(score)
        flake = True
    elif merged > 0:
        output += "\n# Flake probability: {0:.3f} (clustering)\n".format(score)
        if "trackers" in response:
            tracker = response["trackers"][0]
            output += "# Flake tracked {0} likelihood {1:.3f}\n".format(tracker[0], tracker[1])

    if flake:
        output = filterRetry(output, "# RETRY due to flakiness")
    return output

# -----------------------------------------------------------------------------
# Retry policy

def checkRetry(trace):
    # We check for persistent but test harness or framework specific
    # failures that otherwise cause flakiness and false positives.
    #
    # The things we check here must:
    #  * have no impact on users of Cockpit in the real world
    #  * be things we tried to resolve in other ways. This is a last resort
    #

    trace = normalize_traceback(trace)

    # HACK: Interacting with sshd during boot is not always predictable
    # We're using an implementation detail of the server as our "way in" for testing.
    # This often has to do with sshd being restarted for some reason
    if "SSH master process exited with code: 255" in trace:
        return True

    # HACK: Intermittently the new libvirt machine won't get an IP address
    # or SSH will completely fail to start. We've tried various approaches
    # to minimize this, but it happens every 100,000 tests or so
    if "Failure: Unable to reach machine " in trace:
        return True

    # HACK: For when the verify machine runs out of available processes
    # We should retry this test process
    if "self.pid = os.fork()\nOSError: [Errno 11] Resource temporarily unavailable" in trace:
        return True

    return False


# -----------------------------------------------------------------------------
# Known Issue Matching and Filing

def normalize_traceback(trace):
    # All file paths converted to basename
    trace = re.sub(r'File "[^"]*/([^/"]+)"', 'File "\\1"', trace.strip())

    # replace noise in SELinux violations
    trace = re.sub('audit\([0-9.:]+\)', 'audit(0)', trace)
    trace = re.sub(r'\b(pid|ino)=[0-9]+ ', r'\1=0 ', trace)

    # in Python 3, testlib.Error is shown with namespace
    trace = re.sub('testlib\.Error', 'Error', trace)
    return trace

def check_known_issue(api, trace, image):
    image_naughty = os.path.join(BOTS, "naughty", get_test_image(image))
    if not os.path.exists(image_naughty):
        return None

    trace = normalize_traceback(trace)
    number = None
    for naughty in os.listdir(image_naughty):
        (prefix, unused, name) = naughty.partition("-")
        n = int(prefix)
        with open(os.path.join(image_naughty, naughty), "r") as fp:
            match = "*" + normalize_traceback(fp.read()) + "*"
        # Match as in a file name glob, albeit multi line, and account for literal pastes with '[]'
        if fnmatch.fnmatchcase(trace, match) or fnmatch.fnmatchcase(trace, match.replace("[", "?")):
            number = n
    return number


# Update a known issue thread on GitHub
#
# The idea is to combine repeated errors into fewer comments by
# editing them and keeping all relevant information.
#
# For this we keep one comment per context (e.g. 'verify/fedora-atomic')
# and divide that into sections, one each per error description / trace.
# In each section, we keep the error description / trace as well as
# the number of recorded events, the first occurrence and the last 10
# occurrences.
# For each (listed) occurrence we display the timestamp and some details
# provided by the caller, such as a revision or link to log files.
# The details can't contain newline characters and should be brief
def update_known_issue(api, number, err, details, context, timestamp=None):
    timestamp = timestamp or datetime.datetime.now().isoformat()

    link = timestamp
    if details:
        link = "{0} | {1}".format(timestamp, details)

    comments = issue_comments(api, number)

    # try to find an existing comment to update; extract the traceback from the
    # whole output; also ensure to remove the "# duration: XXs" trailer
    err_key = normalize_traceback(err).strip()
    m = re.search("^(Traceback.*^not ok[^#\\n]*)", err_key, re.S | re.M)
    if m:
        err_key = m.group(1)
    comment_key = "{0}\n".format(context)
    latest_occurrences = "Latest occurrences:\n\n"
    for comment in reversed(comments):
        if 'body' in comment and comment['body'].startswith(comment_key):
            parts = comment['body'].split("<hr>")
            updated = False
            for part_idx, part in enumerate(parts):
                part = normalize_traceback(part).strip()
                if err_key in part:
                    latest = part.split(latest_occurrences)
                    if len(latest) < 2:
                        sys.stderr.write("Error while parsing latest occurrences\n")
                    else:
                        # number of times this error was recorded
                        header = latest[0].split("\n")
                        for header_idx, entry in enumerate(header):
                            if entry.startswith("Times recorded: "):
                                rec_entries = entry.split(" ")
                                rec_entries[-1] = str(int(rec_entries[-1]) + 1)
                                header[header_idx] = " ".join(rec_entries)
                        latest[0] = "\n".join(header)
                        # list of recent occurrences
                        occurrences = [_f for _f in latest[1].split("\n") if _f]
                        occurrences.append("- {0}\n".format(link))
                        # only keep the last 10
                        if len(occurrences) > 10:
                            occurrences.pop(0)
                        parts[part_idx] = "{0}{1}{2}".format(latest[0], latest_occurrences, "\n".join(occurrences))
                        updated = True
                    break

            if updated:
                # shuffle the updated part to the end
                assert len(parts) > part_idx
                parts.append(parts[part_idx])
                del parts[part_idx]

            else:
                # add a new part
                while len(parts) > 10: # maximum 10 traces
                    parts.pop()

                parts.append("""
```
{0}
```
First occurrence: {1}
Times recorded: 1
{2}- {1}
""".format(err.strip(), link, latest_occurrences))
                updated = True

            # update comment, no need to check others
            body = "<hr>\n".join(parts)

            # ensure that the body is not longer than 64k.
            # drop earlier parts if we need to.
            while len(body) >= 65536:
                parts.pop(1) # parts[0] is the header

                body = "<hr>\n".join(parts)

            return api.patch("issues/comments/{0}".format(comment['id']), { "body": body })

    # create a new comment, since we didn't find one to update

    data = { "body": """{0}\nOoops, it happened again<hr>
```
{1}
```
First occurrence: {2}
Times recorded: 1
{3}- {2}
""".format(context, err.strip(), link, latest_occurrences) }
    return api.post("issues/{0}/comments".format(number), data)

def issue_comments(api, number):
    result = [ ]
    page = 1
    count = 100
    while count == 100:
        comments = api.get("issues/{0}/comments?page={1}&per_page={2}".format(number, page, count))
        count = 0
        page += 1
        if comments:
            result += comments
            count = len(comments)
    return result


def post_github(api, number, output, image):

    # Ignore this if we were not given a token
    if not api or not api.available:
        return

    context = "verify/{0}".format(image)

    # Lookup the link being logged to
    link = None
    revision = os.environ.get("TEST_REVISION", None)
    if revision:
        link = "revision {0}".format(revision)
        statuses = api.get("commits/{0}/statuses".format(revision))
        if statuses:
            for status in statuses:
                if status["context"] == context:
                    link = "revision {0}, [logs]({1})".format(revision, status["target_url"])
                    break
    update_known_issue(api, number, output, link, context)

if __name__ == '__main__':
    sys.exit(main())