#!/usr/bin/env python3
# This file is part of Cockpit.
#
# Copyright (C) 2017 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
import gzip
import json
import os
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
import html.parser
sys.dont_write_bytecode = True

import task
from machine import testvm

# The number of days of previous closed pull requests to learn from
SINCE_DAYS = 120

BOTS = os.path.abspath(os.path.dirname(__file__))
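
# Everything we've already processed or retrieved: revision SHAs, log
# URLs and pull request identifiers all share this one set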
SEEDED = set()
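
# Cache of the list of links previously fetched from each sink URL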
SINKS = { }

def run(filename, verbose=False, dry=False, **kwargs):
    since = time.time() - 60 * 60 * 24 * SINCE_DAYS
    pulls = Pulls(since)

    # Seed with our input data
    if filename:
        if "/" not in filename and not os.path.exists(filename):
            if not dry:
                subprocess.check_call([ os.path.join(BOTS, "image-download"), "--state", filename ])
            filename = os.path.join(testvm.get_images_data_dir(), filename)
        (outfd, outname) = tempfile.mkstemp(prefix=os.path.basename(filename), dir=os.path.dirname(filename))
        os.close(outfd)
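        # Fresh records go to this temporary file, which is renamed over
        # the original below once everything succeeded, so an interrupted
        # run cannot corrupt the existing data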
        output = gzip.open(outname, 'wb')
        if os.path.exists(filename):
            with gzip.open(filename, 'rb') as fp:
                seed(since, fp, pulls, output)
    else:
        output = sys.stdout.buffer
        outname = None

    def write(**kwargs):
        line = json.dumps(kwargs).encode('utf-8') + b"\n"
        output.write(line)

    # Iterate through all revisions, pull requests on this branch
    for (commit, merged, created, pull) in commits("master", pulls, since, verbose):
        logged = False
        if verbose:
            sys.stderr.write("- {0}\n".format(commit))
        for (context, created, url, log) in logs(commit):
            if verbose:
                sys.stderr.write(" - {0} {1}\n".format(created, context))
            for (status, name, body, tracker) in tap(log):
                write(pull=pull, revision=commit, status=status,
                      context=context, date=created, merged=merged,
                      test=name, url=url, tracker=tracker, log=body)
                logged = True

            # Nothing found for this log
            if not logged:
                write(pull=pull, revision=commit, status="unknown", date=created,
                      merged=merged, url=url, log=log)
                logged = True

        # Nothing found for this revision
        if not logged:
            write(pull=pull, revision=commit, status="unknown", date=created, merged=merged)
            logged = True

    sys.stdout.flush()
    if output:
        output.close()
    if outname:
        os.rename(outname, filename)

    if not dry and outname and filename:
        upload = [ os.path.join(BOTS, "image-upload"), "--state", filename ]
        subprocess.check_call(upload)
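
# Each line written above is a single JSON record, roughly of this shape
# (values are illustrative, not real data):
#
#   {"pull": ".../pulls/1234", "revision": "8d7c...", "status": "failure",
#    "context": "verify/fedora-30", "date": "2019-08-01T10:00:00Z",
#    "merged": true, "test": "testBasic", "url": "https://.../log",
#    "tracker": null, "log": "..."}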

# An HTML parser that just pulls out all the <a href="...">
# link hrefs in a given page of content. We also qualify these
# hrefs with a base url, in case they're relative
class HrefParser(html.parser.HTMLParser):
    def __init__(self, base, hrefs):
        html.parser.HTMLParser.__init__(self)
        self.hrefs = hrefs
        self.base = base

    def handle_starttag(self, tag, attrs):
        if tag.lower() == "a":
            for (name, value) in attrs:
                if name.lower() == "href":
                    url = urllib.parse.urljoin(self.base, value)
                    self.hrefs.append(url)
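
# For example, parsing '<a href="log.html">' with the base
# "https://sink.example.com/logs/pull-1/" (hostname illustrative) appends
# "https://sink.example.com/logs/pull-1/log.html" to the hrefs list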

# Check if a given pull request was included in its base
# branch via merging or otherwise
class Pulls():
    def __init__(self, since):
        self.fetched = { }
        self.checked = { }
        self.pulls = { }
        self.listing = [ ]
        self.since = since

    # Get all the pull requests since a given time
    def __iter__(self):
        if self.listing:
            iterate = self.pulls.values()
        else:
            iterate = task.api.pulls(state="all", since=self.since)
        listing = [ ]
        for pull in iterate:
            self.pulls[pull["number"]] = pull
            listing.append(pull)
            yield pull
        self.listing = listing

    # Turn a string/int pull number into a pull object
    def normalize(self, pull):
        if isinstance(pull, int):
            pull = str(pull)
        if isinstance(pull, str):
            if "/" not in pull:
                pull = qualify("pulls/{0}".format(pull))
            if pull in self.pulls:
                pull = self.pulls[pull]
            else:
                pull = task.api.get(pull)
                self.pulls[pull["url"]] = pull
        elif not isinstance(pull, dict):
            raise ValueError("Invalid pull request: {0}".format(repr(pull)))
        return pull

    def merged(self, pull):
        pull = self.normalize(pull)
        number = pull["number"]
        if number in self.checked:
            return self.checked[number]
        if pull.get("state") != "closed":
            return None

        # GitHub is telling us this was merged
        if pull.get("merged"):
            return True

        # Fetch git data about this branch
        cwd = os.path.dirname(__file__)
        base = pull["base"]["ref"]
        if base not in self.fetched:
            try:
                subprocess.check_call([ "git", "fetch", "-q", "--", "origin", base ], cwd=cwd)
            except subprocess.CalledProcessError:
                return None  # error already printed by process
            self.fetched[base] = base

        # Look for git commits up until a year before the pull request
        when = time.mktime(time.strptime(pull["created_at"], "%Y-%m-%dT%H:%M:%SZ"))
        when -= 60 * 60 * 24 * 365
        since = time.strftime("%Y-%m-%d", time.gmtime(when))

        # Check if the pull request is referred to on this branch
        match = "(Closes|Fixes|closes|fixes).*{0}".format(number)
        cmd = [
            "git", "log", "--extended-regexp", "--grep", match,
            "--since=" + since, "origin/" + base
        ]
        output = subprocess.check_output(cmd, cwd=cwd)
        self.checked[number] = bool(output)
        return self.checked[number]
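
# The fallback above boils down to something like the following, where any
# output at all means some commit on the branch claims to close the pull
# (number and date illustrative):
#
#   git log --extended-regexp --grep '(Closes|Fixes|closes|fixes).*1234' \
#       --since=2018-08-22 origin/master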

# Retrieves the content of the given URL
def retrieve(url):
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    req = urllib.request.urlopen(url, context=ctx)
    return req.read().decode('utf-8', 'replace')
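
# Note that certificate verification is disabled above, presumably because
# the log sinks serve self-signed certificates; either way the retrieved
# content should be treated as untrusted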

# Returns a list of all link targets found at the given URL
def links(url):
    result = [ ]
    parser = HrefParser(url, result)
    try:
        parser.feed(retrieve(url))
    except urllib.error.HTTPError as ex:
        if ex.code != 404:
            raise
    except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))
    return result

# Parses seed input data and passes it through to the output,
# all the while recording which URLs and revisions have
# already been seen
def seed(since, fp, pulls, output):
    seeded = None
    known = re.compile("# SKIP Known issue #([0-9]+)", re.IGNORECASE)
    while True:
        try:
            line = fp.readline()
        except (OSError, zlib.error) as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            break
        if not line:
            break
        try:
            item = json.loads(line.decode('utf-8'))
        except ValueError as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            continue

        # Once we see a new pull, treat the old one as complete and seeded.
        # As a failsafe, just to make sure we didn't miss something,
        # we don't treat the last pull request as completely seeded
        pull = item.get("pull")
        if pull and pull != seeded:
            SEEDED.add(seeded)
            seeded = None
        if pull and item.get("merged") not in [ True, False ]:
            item["merged"] = pulls.merged(pull)

        # Note that we've already retrieved this URL
        url = item.get("url")
        if url and item.get("log") is not None:
            SEEDED.add(url)
            SEEDED.add(urllib.parse.urljoin(url, "./"))

        # If the pull request had a known merged value it can be seeded.
        # This forces us to retrieve data about open pull requests again
        if item["merged"] in [ True, False ]:
            seeded = pull
            SEEDED.add(item["revision"])

        date = item.get("date")
        if not date or since > time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%SZ")):
            continue

        # COMPAT: Fix data that wasn't yet valid
        if item["status"] == "skip":
            match = known.search(item["log"])
            if match:
                item["status"] = "failure"
                item["tracker"] = qualify("issues/{0}".format(match.group(1)))

        line = json.dumps(item).encode('utf-8') + b"\n"
        output.write(line)

# Generate (revision, merged, date, url) tuples for the given branch.
# This includes pull requests targeting the branch in question
#
# revision: the SHA of a commit
# merged: True/False/None whether merged or not
# date: when the commit or pull request was created
# url: the URL for the pull request or None
def commits(branch, pulls, since, verbose=False):
    if verbose:
        sys.stderr.write("{0}\n".format(branch))

    # Iterate through commits on the branch itself
    for commit in task.api.commits(branch, since=since):
        revision = commit["sha"].lower()
        if revision not in SEEDED:
            yield revision, True, commit["commit"]["committer"]["date"], None

    # Iterate through pull requests
    for pull in pulls:
        if pull["number"] in SEEDED:
            continue
        if pull["base"]["ref"] != branch:
            continue
        if verbose:
            sys.stderr.write("pull-{0}\n".format(pull["number"]))
        merged = pulls.merged(pull)
        for revision in revisions(pull):
            yield revision, merged, pull["created_at"], pull["url"]

            # Only the first revision produced by revisions() can be the
            # one that got merged; the rest were superseded
            if merged:
                merged = False

# Get all the revisions in a pull request. GitHub doesn't help
# us here so we have to use silly tricks
def revisions(pull):
    head = pull.get("head", { }).get("sha")
    if not head:
        return

    # First give back the main pull request revision
    head = head.lower()
    yield head

    # All the revisions we've seen
    seen = set([ head ])

    # Seed the set of sinks. We use these sinks to figure out additional
    # revisions for the pull request. Unfortunately GitHub doesn't help us
    # with a list of revisions that this pull request used to reflect. So
    # we have to look to our sink for that info.
    data = task.api.get("commits/{0}/status?page=1&per_page=100".format(head))
    for status in data.get("statuses", [ ]):
        url = status["target_url"]
        if url:
            SEEDED.add(urllib.parse.urljoin(url, "./"))
            sink = urllib.parse.urljoin(url, "../")
            if sink not in SINKS:
                SINKS[sink] = links(sink)
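
    # For a target_url like "https://sink.example.com/logs/pull-1234-x/log.html"
    # (hostname illustrative), the "./" join above yields that test run's
    # directory and the "../" join yields the sink root listing all runs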

    # Now ask each sink for its set of urls
    name = "pull-{0}".format(pull["number"])
    for sink in SINKS:
        for link in SINKS[sink]:
            # We only care about stuff at the sink where pull-XXXX is in
            # the URL. This is how we figure out whether things are related
            if name not in link:
                continue

            # Already retrieved this one
            if link in SEEDED:
                continue

            # Build a URL for the cockpituous sink /status file and read it
            target = urllib.parse.urljoin(link, "status")
            try:
                data = json.loads(retrieve(target))
            except (ValueError, ConnectionError) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
            except urllib.error.HTTPError as ex:
                if ex.code != 404:
                    raise
            except urllib.error.URLError as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
            else:
                # The status file contains a "revision" field which is the
                # git revision of what was tested during that test run.
                # This is what we're after
                if "revision" in data:
                    revision = data["revision"].lower()
                    if revision not in seen:
                        seen.add(revision)
                        yield revision

# Pull out all statuses (context, created, url, log) for a given revision.
# This includes multiple test runs for a given revision, and all the
# various status contexts
def logs(revision):
    page = 1
    count = 100
    while count == 100:
        data = task.api.get("commits/{0}/status?page={1}&per_page={2}".format(revision, page, count))
        count = 0

        # Advance to the next page of statuses for the next iteration
        page += 1
        for status in data.get("statuses", [ ]):
            count += 1

            # Make sure to not consider "state": "success" as a success
            # here because individual tests may have failed, or been retried.
            #
            # Always only consider tests individually to have run or failed,
            # not entire test suite statuses
            if status["state"] in [ "pending" ]:
                continue
            target = status.get("target_url")
            if not target:
                continue
            if target.endswith(".html"):
                target = target[:-5]
            if target in SEEDED:
                continue
            log = None
            try:
                log = retrieve(target)
            except urllib.error.HTTPError as ex:
                if ex.code != 404:
                    raise
                log = ""
            except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
            if log is not None:
                yield (status["context"], status["created_at"], target, log)

# Generate (status, name, body, tracker) for each Test Anything Protocol test
# in the content.
#
# status: possible values "success", "failure", "skip"
# name: the name of the test
# body: full log of the test
# tracker: url tracking the failure, or None
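#
# A typical log looks something like this (illustrative, not real output):
#
#   1..3
#   ok 1 testBasic duration: 42s
#   not ok 2 testOther
#   ok 3 testKnown # SKIP Known issue #1234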
def tap(content):
    name = status = tracker = None
    prefix = None
    body = [ ]
    blocks = False
    for line in content.split('\n'):

        # The test intro, everything before here is fluff
        if not prefix and line.startswith("1.."):
            prefix = line
            body = [ ]
            name = status = tracker = None

        # A TAP test status line
        elif line.startswith("ok ") or line.startswith("not ok "):
            body.append(line)

            # Parse out the status
            if line.startswith("not ok "):
                status = "failure"
                line = line[7:]
            else:
                line = line[3:]
                if "# SKIP KNOWN ISSUE" in line.upper():
                    status = "failure"
                    (unused, delim, issue) = line.rpartition("#")
                    tracker = qualify("issues/{0}".format(issue.strip()))
                elif "# SKIP" in line.upper():
                    status = "skip"
                else:
                    status = "success"

            # Parse out the name
            while line and (line[0].isspace() or line[0].isdigit()):
                line = line[1:]
            (name, delim, directive) = line.partition("#")
            (name, delim, directive) = name.partition("duration")
            name = name.strip()

            # Old Cockpit tests had strange blocks
            if not blocks:
                yield (status, name, "\n".join(body), tracker)
                status = name = tracker = None
                body = [ ]

        else:
            # Old Cockpit tests didn't properly separate their output
            # into blocks
            if line.startswith("# --------------------"):
                blocks = True
                if status:
                    yield (status, name, "\n".join(body), tracker)
                    name = status = tracker = None
                    body = [ ]
            body.append(line)

# Qualify a URL into the GitHub repository
def qualify(path):
    return "https://api.github.com" + task.api.qualify(path)

if __name__ == '__main__':
    task.main(function=run, title="Pull out test data for pull requests", verbose=True)