parent
044b8da55a
commit
759617c9c0
288 changed files with 13040 additions and 1 deletions
414
bots/tests-policy
Executable file
414
bots/tests-policy
Executable file
|
|
@ -0,0 +1,414 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# This file is part of Cockpit.
|
||||
#
|
||||
# Copyright (C) 2013 Red Hat, Inc.
|
||||
#
|
||||
# Cockpit is free software; you can redistribute it and/or modify it
|
||||
# under the terms of the GNU Lesser General Public License as published by
|
||||
# the Free Software Foundation; either version 2.1 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Cockpit is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public License
|
||||
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import fnmatch
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import urllib
|
||||
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
FLAKE_THRESHOLD = 0.4
|
||||
|
||||
from task import github
|
||||
try:
|
||||
from machine.testvm import get_test_image
|
||||
except ImportError:
|
||||
# fallback for unit tests where libvirt is not available
|
||||
get_test_image = lambda i: i
|
||||
|
||||
BOTS = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
def main():
|
||||
script = os.path.basename(__file__)
|
||||
parser = argparse.ArgumentParser(description='Check a traceback for a known issue')
|
||||
parser.add_argument('-o', "--offline", action='store_true',
|
||||
help="Work offline, don't fetch new data or contact servers")
|
||||
parser.add_argument("--simple", action="store_true",
|
||||
default=(script == "image-naughty"),
|
||||
help="Just print out a known issue number")
|
||||
parser.add_argument("-v", "--verbose", action="store_true",
|
||||
help="Print out more verbose logging info")
|
||||
parser.add_argument('context', help="The image to check against")
|
||||
opts = parser.parse_args()
|
||||
|
||||
base = None
|
||||
if os.environ.get("GITHUB_KNOWN_ISSUE_BASE"):
|
||||
netloc = os.environ.get("GITHUB_API", "https://api.github.com")
|
||||
base = "{0}/repos/{1}/".format(netloc, os.environ.get("GITHUB_KNOWN_ISSUE_BASE"))
|
||||
|
||||
api = None if opts.offline else github.GitHub(base=base)
|
||||
|
||||
context = opts.context
|
||||
if "/" not in context:
|
||||
context = "verify/{0}".format(context)
|
||||
prefix, _, image = context.partition("/")
|
||||
image, _, scenario = image.partition("/")
|
||||
|
||||
try:
|
||||
output = sys.stdin.read()
|
||||
number = None
|
||||
|
||||
if image and output:
|
||||
number = check_known_issue(api, output, image)
|
||||
|
||||
if number and api and api.available:
|
||||
try:
|
||||
post_github(api, number, output, image)
|
||||
except (socket.error, RuntimeError):
|
||||
traceback.print_exc()
|
||||
sys.stderr.write("{0}: posting update to GitHub failed\n".format(script))
|
||||
# Fall through
|
||||
|
||||
# Simple single number output
|
||||
if opts.simple:
|
||||
if number:
|
||||
sys.stdout.write("{0}\n".format(number))
|
||||
return 0
|
||||
|
||||
# Otherwise we filter the output and write it back
|
||||
if number:
|
||||
output = filterSkip(output, "# SKIP Known issue #{0}".format(number))
|
||||
elif checkRetry(output):
|
||||
output = filterRetry(output, "# RETRY due to failure of test harness or framework")
|
||||
elif not opts.offline:
|
||||
output = guessFlake(output, context, opts.verbose)
|
||||
|
||||
sys.stdout.write(output)
|
||||
return 0
|
||||
|
||||
except RuntimeError as ex:
|
||||
sys.stderr.write("{0}: {1}\n".format(script, ex))
|
||||
return 1
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# TAP Parsing
|
||||
|
||||
# Update TAP output failure and put a skip message
|
||||
# in the appropriate place
|
||||
def filterSkip(output, skip):
|
||||
lines = output.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("not ok "):
|
||||
lines[i] = line[4:] + " " + skip
|
||||
return "\n".join(lines)
|
||||
|
||||
# Update TAP output failure and put a retry message
|
||||
# in the appropriate place
|
||||
def filterRetry(output, retry):
|
||||
lines = output.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("not ok "):
|
||||
lines[i] = line + " " + retry
|
||||
return "\n".join(lines)
|
||||
|
||||
# Figure out the name from a failed test
|
||||
def parseName(output):
|
||||
for line in output.split("\n"):
|
||||
if line.startswith("not ok "):
|
||||
line = line[7:]
|
||||
while line[0].isspace() or line[0].isdigit():
|
||||
line = line[1:]
|
||||
(name, delim, directive) = line.partition("#")
|
||||
(name, delim, directive) = name.partition("duration")
|
||||
name = name.strip()
|
||||
return name
|
||||
return ""
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Flakiness Checks
|
||||
|
||||
def guessFlake(output, context, verbose=False):
|
||||
# Build up an item just like in tests-data
|
||||
item = {
|
||||
"pull": None,
|
||||
"revision": os.environ.get("TEST_REVISION"),
|
||||
"status": "failure",
|
||||
"context": context,
|
||||
"date": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"merged": None,
|
||||
"test": parseName(output),
|
||||
"url": None,
|
||||
"log": output
|
||||
}
|
||||
|
||||
host = os.environ.get("COCKPIT_LEARN_SERVICE_HOST", "learn-cockpit.apps.ci.centos.org")
|
||||
port = os.environ.get("COCKPIT_LEARN_SERVICE_PORT", "443")
|
||||
response = { }
|
||||
flake = False
|
||||
|
||||
url = "{0}://{1}:{2}/predict".format("https" if port == "443" else "http", host, port)
|
||||
jsonl = json.dumps(item) + "\n"
|
||||
try:
|
||||
data = urllib.request.urlopen(url, data=jsonl.encode('utf-8')).read()
|
||||
except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
|
||||
sys.stderr.write("{0}: {1}\n".format(url, ex))
|
||||
data = b"{ }"
|
||||
try:
|
||||
response = json.loads(data.decode('utf-8'))
|
||||
except ValueError as ex:
|
||||
sys.stderr.write("{0}: {1}\n".format(url, ex))
|
||||
|
||||
if verbose and response:
|
||||
sys.stderr.write(json.dumps(response, indent=4) + "\n")
|
||||
|
||||
def count(record, field, only):
|
||||
values = record.get(field, [])
|
||||
for value, count in values:
|
||||
if value != only:
|
||||
continue
|
||||
return count
|
||||
return 0
|
||||
|
||||
merged = count(response, "merged", True)
|
||||
not_merged = count(response, "merged", False)
|
||||
null_merged = count(response, "merged", None)
|
||||
|
||||
score = 0
|
||||
total = merged + not_merged + null_merged
|
||||
if total:
|
||||
score = (merged / total)
|
||||
if score > FLAKE_THRESHOLD:
|
||||
output += "\n# Flake likely: {0:.3f} (clustering). Please file issue if this not a flake\n".format(score)
|
||||
flake = True
|
||||
elif merged > 0:
|
||||
output += "\n# Flake probability: {0:.3f} (clustering)\n".format(score)
|
||||
if "trackers" in response:
|
||||
tracker = response["trackers"][0]
|
||||
output += "# Flake tracked {0} likelihood {1:.3f}\n".format(tracker[0], tracker[1])
|
||||
|
||||
if flake:
|
||||
output = filterRetry(output, "# RETRY due to flakiness")
|
||||
return output
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Retry policy
|
||||
|
||||
def checkRetry(trace):
|
||||
# We check for persistent but test harness or framework specific
|
||||
# failures that otherwise cause flakiness and false positives.
|
||||
#
|
||||
# The things we check here must:
|
||||
# * have no impact on users of Cockpit in the real world
|
||||
# * be things we tried to resolve in other ways. This is a last resort
|
||||
#
|
||||
|
||||
trace = normalize_traceback(trace)
|
||||
|
||||
# HACK: Interacting with sshd during boot is not always predictable
|
||||
# We're using an implementation detail of the server as our "way in" for testing.
|
||||
# This often has to do with sshd being restarted for some reason
|
||||
if "SSH master process exited with code: 255" in trace:
|
||||
return True
|
||||
|
||||
# HACK: Intermittently the new libvirt machine won't get an IP address
|
||||
# or SSH will completely fail to start. We've tried various approaches
|
||||
# to minimize this, but it happens every 100,000 tests or so
|
||||
if "Failure: Unable to reach machine " in trace:
|
||||
return True
|
||||
|
||||
# HACK: For when the verify machine runs out of available processes
|
||||
# We should retry this test process
|
||||
if "self.pid = os.fork()\nOSError: [Errno 11] Resource temporarily unavailable" in trace:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Known Issue Matching and Filing
|
||||
|
||||
def normalize_traceback(trace):
|
||||
# All file paths converted to basename
|
||||
trace = re.sub(r'File "[^"]*/([^/"]+)"', 'File "\\1"', trace.strip())
|
||||
|
||||
# replace noise in SELinux violations
|
||||
trace = re.sub('audit\([0-9.:]+\)', 'audit(0)', trace)
|
||||
trace = re.sub(r'\b(pid|ino)=[0-9]+ ', r'\1=0 ', trace)
|
||||
|
||||
# in Python 3, testlib.Error is shown with namespace
|
||||
trace = re.sub('testlib\.Error', 'Error', trace)
|
||||
return trace
|
||||
|
||||
def check_known_issue(api, trace, image):
|
||||
image_naughty = os.path.join(BOTS, "naughty", get_test_image(image))
|
||||
if not os.path.exists(image_naughty):
|
||||
return None
|
||||
|
||||
trace = normalize_traceback(trace)
|
||||
number = None
|
||||
for naughty in os.listdir(image_naughty):
|
||||
(prefix, unused, name) = naughty.partition("-")
|
||||
n = int(prefix)
|
||||
with open(os.path.join(image_naughty, naughty), "r") as fp:
|
||||
match = "*" + normalize_traceback(fp.read()) + "*"
|
||||
# Match as in a file name glob, albeit multi line, and account for literal pastes with '[]'
|
||||
if fnmatch.fnmatchcase(trace, match) or fnmatch.fnmatchcase(trace, match.replace("[", "?")):
|
||||
number = n
|
||||
return number
|
||||
|
||||
|
||||
# Update a known issue thread on GitHub
|
||||
#
|
||||
# The idea is to combine repeated errors into fewer comments by
|
||||
# editing them and keeping all relevant information.
|
||||
#
|
||||
# For this we keep one comment per context (e.g. 'verify/fedora-atomic')
|
||||
# and divide that into sections, one each per error description / trace.
|
||||
# In each section, we keep the error description / trace as well as
|
||||
# the number of recorded events, the first occurrence and the last 10
|
||||
# occurrences.
|
||||
# For each (listed) occurrence we display the timestamp and some details
|
||||
# provided by the caller, such as a revision or link to log files.
|
||||
# The details can't contain newline characters and should be brief
|
||||
def update_known_issue(api, number, err, details, context, timestamp=None):
|
||||
timestamp = timestamp or datetime.datetime.now().isoformat()
|
||||
|
||||
link = timestamp
|
||||
if details:
|
||||
link = "{0} | {1}".format(timestamp, details)
|
||||
|
||||
comments = issue_comments(api, number)
|
||||
|
||||
# try to find an existing comment to update; extract the traceback from the
|
||||
# whole output; also ensure to remove the "# duration: XXs" trailer
|
||||
err_key = normalize_traceback(err).strip()
|
||||
m = re.search("^(Traceback.*^not ok[^#\\n]*)", err_key, re.S | re.M)
|
||||
if m:
|
||||
err_key = m.group(1)
|
||||
comment_key = "{0}\n".format(context)
|
||||
latest_occurrences = "Latest occurrences:\n\n"
|
||||
for comment in reversed(comments):
|
||||
if 'body' in comment and comment['body'].startswith(comment_key):
|
||||
parts = comment['body'].split("<hr>")
|
||||
updated = False
|
||||
for part_idx, part in enumerate(parts):
|
||||
part = normalize_traceback(part).strip()
|
||||
if err_key in part:
|
||||
latest = part.split(latest_occurrences)
|
||||
if len(latest) < 2:
|
||||
sys.stderr.write("Error while parsing latest occurrences\n")
|
||||
else:
|
||||
# number of times this error was recorded
|
||||
header = latest[0].split("\n")
|
||||
for header_idx, entry in enumerate(header):
|
||||
if entry.startswith("Times recorded: "):
|
||||
rec_entries = entry.split(" ")
|
||||
rec_entries[-1] = str(int(rec_entries[-1]) + 1)
|
||||
header[header_idx] = " ".join(rec_entries)
|
||||
latest[0] = "\n".join(header)
|
||||
# list of recent occurrences
|
||||
occurrences = [_f for _f in latest[1].split("\n") if _f]
|
||||
occurrences.append("- {0}\n".format(link))
|
||||
# only keep the last 10
|
||||
if len(occurrences) > 10:
|
||||
occurrences.pop(0)
|
||||
parts[part_idx] = "{0}{1}{2}".format(latest[0], latest_occurrences, "\n".join(occurrences))
|
||||
updated = True
|
||||
break
|
||||
|
||||
if updated:
|
||||
# shuffle the updated part to the end
|
||||
assert len(parts) > part_idx
|
||||
parts.append(parts[part_idx])
|
||||
del parts[part_idx]
|
||||
|
||||
else:
|
||||
# add a new part
|
||||
while len(parts) > 10: # maximum 10 traces
|
||||
parts.pop()
|
||||
|
||||
parts.append("""
|
||||
```
|
||||
{0}
|
||||
```
|
||||
First occurrence: {1}
|
||||
Times recorded: 1
|
||||
{2}- {1}
|
||||
""".format(err.strip(), link, latest_occurrences))
|
||||
updated = True
|
||||
|
||||
# update comment, no need to check others
|
||||
body = "<hr>\n".join(parts)
|
||||
|
||||
# ensure that the body is not longer than 64k.
|
||||
# drop earlier parts if we need to.
|
||||
while len(body) >= 65536:
|
||||
parts.pop(1) # parts[0] is the header
|
||||
|
||||
body = "<hr>\n".join(parts)
|
||||
|
||||
return api.patch("issues/comments/{0}".format(comment['id']), { "body": body })
|
||||
|
||||
# create a new comment, since we didn't find one to update
|
||||
|
||||
data = { "body": """{0}\nOoops, it happened again<hr>
|
||||
```
|
||||
{1}
|
||||
```
|
||||
First occurrence: {2}
|
||||
Times recorded: 1
|
||||
{3}- {2}
|
||||
""".format(context, err.strip(), link, latest_occurrences) }
|
||||
return api.post("issues/{0}/comments".format(number), data)
|
||||
|
||||
def issue_comments(api, number):
|
||||
result = [ ]
|
||||
page = 1
|
||||
count = 100
|
||||
while count == 100:
|
||||
comments = api.get("issues/{0}/comments?page={1}&per_page={2}".format(number, page, count))
|
||||
count = 0
|
||||
page += 1
|
||||
if comments:
|
||||
result += comments
|
||||
count = len(comments)
|
||||
return result
|
||||
|
||||
|
||||
def post_github(api, number, output, image):
|
||||
|
||||
# Ignore this if we were not given a token
|
||||
if not api or not api.available:
|
||||
return
|
||||
|
||||
context = "verify/{0}".format(image)
|
||||
|
||||
# Lookup the link being logged to
|
||||
link = None
|
||||
revision = os.environ.get("TEST_REVISION", None)
|
||||
if revision:
|
||||
link = "revision {0}".format(revision)
|
||||
statuses = api.get("commits/{0}/statuses".format(revision))
|
||||
if statuses:
|
||||
for status in statuses:
|
||||
if status["context"] == context:
|
||||
link = "revision {0}, [logs]({1})".format(revision, status["target_url"])
|
||||
break
|
||||
update_known_issue(api, number, output, link, context)
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue