#!/usr/bin/env python3

# This file is part of Cockpit.
#
# Copyright (C) 2013 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

import argparse
import datetime
import fnmatch
import json
import re
import os
import socket
import sys
import time
import traceback
import urllib.error
import urllib.request

sys.dont_write_bytecode = True

FLAKE_THRESHOLD = 0.4

from task import github

try:
    from machine.testvm import get_test_image
except ImportError:
    # fallback for unit tests where libvirt is not available
    get_test_image = lambda i: i

BOTS = os.path.dirname(os.path.realpath(__file__))


def main():
    script = os.path.basename(__file__)
    parser = argparse.ArgumentParser(description='Check a traceback for a known issue')
    parser.add_argument('-o', "--offline", action='store_true', help="Work offline, don't fetch new data or contact servers")
    parser.add_argument("--simple", action="store_true", default=(script == "image-naughty"), help="Just print out a known issue number")
    parser.add_argument("-v", "--verbose", action="store_true", help="Print out more verbose logging info")
    parser.add_argument('context', help="The image to check against")
    opts = parser.parse_args()

    base = None
    if os.environ.get("GITHUB_KNOWN_ISSUE_BASE"):
        netloc = os.environ.get("GITHUB_API", "https://api.github.com")
        base = "{0}/repos/{1}/".format(netloc, os.environ.get("GITHUB_KNOWN_ISSUE_BASE"))
    api = None if opts.offline else github.GitHub(base=base)

    context = opts.context
    if "/" not in context:
        context = "verify/{0}".format(context)
    prefix, _, image = context.partition("/")
    image, _, scenario = image.partition("/")

    try:
        output = sys.stdin.read()
        number = None
        if image and output:
            number = check_known_issue(api, output, image)
        if number and api and api.available:
            try:
                post_github(api, number, output, image)
            except (socket.error, RuntimeError):
                traceback.print_exc()
                sys.stderr.write("{0}: posting update to GitHub failed\n".format(script))
                # Fall through

        # Simple single number output
        if opts.simple:
            if number:
                sys.stdout.write("{0}\n".format(number))
            return 0

        # Otherwise we filter the output and write it back
        if number:
            output = filterSkip(output, "# SKIP Known issue #{0}".format(number))
        elif checkRetry(output):
            output = filterRetry(output, "# RETRY due to failure of test harness or framework")
        elif not opts.offline:
            output = guessFlake(output, context, opts.verbose)

        sys.stdout.write(output)
        return 0
    except RuntimeError as ex:
        sys.stderr.write("{0}: {1}\n".format(script, ex))
        return 1


# -----------------------------------------------------------------------------
# TAP Parsing

# Update TAP output failure and put a skip message
# in the appropriate place
def filterSkip(output, skip):
    lines = output.split("\n")
    for i, line in enumerate(lines):
        if line.startswith("not ok "):
            lines[i] = line[4:] + " " + skip
    return "\n".join(lines)


# Update TAP output failure and put a retry message
# in the appropriate place
def filterRetry(output, retry):
    lines = output.split("\n")
    for i, line in enumerate(lines):
        if line.startswith("not ok "):
            lines[i] = line + " " + retry
    return "\n".join(lines)


# Figure out the name from a failed test
def parseName(output):
    for line in output.split("\n"):
        if line.startswith("not ok "):
            line = line[7:]
            while line[0].isspace() or line[0].isdigit():
                line = line[1:]
            (name, delim, directive) = line.partition("#")
            (name, delim, directive) = name.partition("duration")
            name = name.strip()
            return name
    return ""


# -----------------------------------------------------------------------------
# Flakiness Checks

def guessFlake(output, context, verbose=False):
    # Build up an item just like in tests-data
    item = {
        "pull": None,
        "revision": os.environ.get("TEST_REVISION"),
        "status": "failure",
        "context": context,
        "date": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "merged": None,
        "test": parseName(output),
        "url": None,
        "log": output
    }

    host = os.environ.get("COCKPIT_LEARN_SERVICE_HOST", "learn-cockpit.apps.ci.centos.org")
    port = os.environ.get("COCKPIT_LEARN_SERVICE_PORT", "443")

    response = { }
    flake = False
    url = "{0}://{1}:{2}/predict".format("https" if port == "443" else "http", host, port)
    jsonl = json.dumps(item) + "\n"
    try:
        data = urllib.request.urlopen(url, data=jsonl.encode('utf-8')).read()
    except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))
        data = b"{ }"
    try:
        response = json.loads(data.decode('utf-8'))
    except ValueError as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))

    if verbose and response:
        sys.stderr.write(json.dumps(response, indent=4) + "\n")

    def count(record, field, only):
        values = record.get(field, [])
        for value, count in values:
            if value != only:
                continue
            return count
        return 0

    merged = count(response, "merged", True)
    not_merged = count(response, "merged", False)
    null_merged = count(response, "merged", None)

    score = 0
    total = merged + not_merged + null_merged
    if total:
        score = (merged / total)

    if score > FLAKE_THRESHOLD:
        output += "\n# Flake likely: {0:.3f} (clustering). Please file issue if this is not a flake\n".format(score)
        flake = True
    elif merged > 0:
        output += "\n# Flake probability: {0:.3f} (clustering)\n".format(score)

    if "trackers" in response:
        tracker = response["trackers"][0]
        output += "# Flake tracked {0} likelihood {1:.3f}\n".format(tracker[0], tracker[1])

    if flake:
        output = filterRetry(output, "# RETRY due to flakiness")

    return output


# -----------------------------------------------------------------------------
# Retry policy

def checkRetry(trace):
    # We check for persistent but test harness or framework specific
    # failures that otherwise cause flakiness and false positives.
    #
    # The things we check here must:
    # * have no impact on users of Cockpit in the real world
    # * be things we tried to resolve in other ways. This is a last resort
    #
    trace = normalize_traceback(trace)

    # HACK: Interacting with sshd during boot is not always predictable
    # We're using an implementation detail of the server as our "way in" for testing.
    # This often has to do with sshd being restarted for some reason
    if "SSH master process exited with code: 255" in trace:
        return True

    # HACK: Intermittently the new libvirt machine won't get an IP address
    # or SSH will completely fail to start. We've tried various approaches
    # to minimize this, but it happens every 100,000 tests or so
    if "Failure: Unable to reach machine " in trace:
        return True

    # HACK: For when the verify machine runs out of available processes
    # We should retry this test process
    if "self.pid = os.fork()\nOSError: [Errno 11] Resource temporarily unavailable" in trace:
        return True

    return False


# -----------------------------------------------------------------------------
# Known Issue Matching and Filing

def normalize_traceback(trace):
    # All file paths converted to basename
    trace = re.sub(r'File "[^"]*/([^/"]+)"', 'File "\\1"', trace.strip())
    # replace noise in SELinux violations
    trace = re.sub(r'audit\([0-9.:]+\)', 'audit(0)', trace)
    trace = re.sub(r'\b(pid|ino)=[0-9]+ ', r'\1=0 ', trace)
    # in Python 3, testlib.Error is shown with namespace
    trace = re.sub(r'testlib\.Error', 'Error', trace)
    return trace


def check_known_issue(api, trace, image):
    image_naughty = os.path.join(BOTS, "naughty", get_test_image(image))
    if not os.path.exists(image_naughty):
        return None

    trace = normalize_traceback(trace)
    number = None
    for naughty in os.listdir(image_naughty):
        (prefix, unused, name) = naughty.partition("-")
        n = int(prefix)
        with open(os.path.join(image_naughty, naughty), "r") as fp:
            match = "*" + normalize_traceback(fp.read()) + "*"
        # Match as in a file name glob, albeit multi line, and account for literal pastes with '[]'
        if fnmatch.fnmatchcase(trace, match) or fnmatch.fnmatchcase(trace, match.replace("[", "?")):
            number = n
    return number


# Update a known issue thread on GitHub
#
# The idea is to combine repeated errors into fewer comments by
# editing them and keeping all relevant information.
#
# For this we keep one comment per context (e.g. 'verify/fedora-atomic')
# and divide that into sections, one each per error description / trace.
# In each section, we keep the error description / trace as well as
# the number of recorded events, the first occurrence and the last 10
# occurrences.
# For each (listed) occurrence we display the timestamp and some details
# provided by the caller, such as a revision or link to log files.
# The details can't contain newline characters and should be brief
def update_known_issue(api, number, err, details, context, timestamp=None):
    timestamp = timestamp or datetime.datetime.now().isoformat()

    link = timestamp
    if details:
        link = "{0} | {1}".format(timestamp, details)

    comments = issue_comments(api, number)

    # try to find an existing comment to update; extract the traceback from the
    # whole output; also ensure to remove the "# duration: XXs" trailer
    err_key = normalize_traceback(err).strip()
    m = re.search("^(Traceback.*^not ok[^#\\n]*)", err_key, re.S | re.M)
    if m:
        err_key = m.group(1)

    comment_key = "{0}\n".format(context)
    latest_occurrences = "Latest occurrences:\n\n"
    for comment in reversed(comments):
        if 'body' in comment and comment['body'].startswith(comment_key):
            parts = comment['body'].split("<hr>")
            updated = False
            for part_idx, part in enumerate(parts):
                part = normalize_traceback(part).strip()
                if err_key in part:
                    latest = part.split(latest_occurrences)
                    if len(latest) < 2:
                        sys.stderr.write("Error while parsing latest occurrences\n")
                    else:
                        # number of times this error was recorded
                        header = latest[0].split("\n")
                        for header_idx, entry in enumerate(header):
                            if entry.startswith("Times recorded: "):
                                rec_entries = entry.split(" ")
                                rec_entries[-1] = str(int(rec_entries[-1]) + 1)
                                header[header_idx] = " ".join(rec_entries)
                        latest[0] = "\n".join(header)

                        # list of recent occurrences
                        occurrences = [_f for _f in latest[1].split("\n") if _f]
                        occurrences.append("- {0}\n".format(link))
                        # only keep the last 10
                        if len(occurrences) > 10:
                            occurrences.pop(0)

                        parts[part_idx] = "{0}{1}{2}".format(latest[0], latest_occurrences, "\n".join(occurrences))
                        updated = True
                    break

            if updated:
                # shuffle the updated part to the end
                assert len(parts) > part_idx
                parts.append(parts[part_idx])
                del parts[part_idx]
            else:
                # add a new part
                while len(parts) > 10:  # maximum 10 traces
                    parts.pop()
                parts.append("""
```
{0}
```
First occurrence: {1}
Times recorded: 1
{2}- {1}
""".format(err.strip(), link, latest_occurrences))
                updated = True

            # update comment, no need to check others
            body = "<hr>\n".join(parts)
            # ensure that the body is not longer than 64k.
            # drop earlier parts if we need to.
            while len(body) >= 65536:
                parts.pop(1)  # parts[0] is the header
                body = "<hr>\n".join(parts)
            return api.patch("issues/comments/{0}".format(comment['id']), { "body": body })

    # create a new comment, since we didn't find one to update
    data = { "body": """{0}\nOoops, it happened again
<hr>

```
{1}
```
First occurrence: {2}
Times recorded: 1
{3}- {2}
""".format(context, err.strip(), link, latest_occurrences) }
    return api.post("issues/{0}/comments".format(number), data)


def issue_comments(api, number):
    result = [ ]
    page = 1
    count = 100
    while count == 100:
        comments = api.get("issues/{0}/comments?page={1}&per_page={2}".format(number, page, count))
        count = 0
        page += 1
        if comments:
            result += comments
            count = len(comments)
    return result


def post_github(api, number, output, image):
    # Ignore this if we were not given a token
    if not api or not api.available:
        return

    context = "verify/{0}".format(image)

    # Lookup the link being logged to
    link = None
    revision = os.environ.get("TEST_REVISION", None)
    if revision:
        link = "revision {0}".format(revision)
        statuses = api.get("commits/{0}/statuses".format(revision))
        if statuses:
            for status in statuses:
                if status["context"] == context:
                    link = "revision {0}, [logs]({1})".format(revision, status["target_url"])
                    break

    update_known_issue(api, number, output, link, context)


if __name__ == '__main__':
    sys.exit(main())
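
# A minimal usage sketch (the script and image names below are made up for
# illustration): TAP output is read from stdin and the filtered result is
# written to stdout, e.g.
#
#   ./tests-policy verify/example-image < test-output.tap
#
# When this file is installed as "image-naughty", --simple defaults to on and
# only the number of a matching known issue is printed:
#
#   ./image-naughty example-image < test-output.tap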