#!/usr/bin/env python3

# This file is part of Cockpit.
#
# Copyright (C) 2017 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

import gzip
import json
import os
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request, urllib.error, urllib.parse
import zlib
import html.parser

sys.dont_write_bytecode = True

import task
from machine import testvm

# The number of days of previous closed pull requests to learn from
SINCE_DAYS = 120

BOTS = os.path.abspath(os.path.dirname(__file__))

# Pull requests, revisions and log URLs that previously collected data
# already covers, and which therefore need not be retrieved again
SEEDED = set()

# Cache of sink URL -> list of links found on that sink's index page
SINKS = { }


# Collect test results for the "master" branch and its pull requests
# and write them out as one JSON object per line
#
# filename: gzip compressed data file to seed from and to replace. A bare
#           name (no "/") that does not exist locally is fetched with
#           image-download and resolved in the images data directory.
#           When empty, the records are written to stdout instead
# verbose:  print progress to stderr
# dry:      neither download nor upload the data file
# kwargs:   accepted and ignored
def run(filename, verbose=False, dry=False, **kwargs):
    since = time.time() - 60 * 60 * 24 * SINCE_DAYS
    pulls = Pulls(since)

    output = None
    outname = None

    def write(**record):
        line = json.dumps(record).encode('utf-8') + b"\n"
        output.write(line)

    try:
        # Seed with our input data
        if filename:
            if "/" not in filename and not os.path.exists(filename):
                if not dry:
                    subprocess.check_call([ os.path.join(BOTS, "image-download"), "--state", filename ])
                filename = os.path.join(testvm.get_images_data_dir(), filename)

            # Write next to the real file, so that the final rename
            # stays on one file system
            (outfd, outname) = tempfile.mkstemp(prefix=os.path.basename(filename),
                                                dir=os.path.dirname(filename))
            os.close(outfd)
            output = gzip.open(outname, 'wb')
            if os.path.exists(filename):
                with gzip.open(filename, 'rb') as fp:
                    seed(since, fp, pulls, output)
        else:
            output = sys.stdout.buffer

        # Iterate through all revisions, pull requests on this branch
        for (commit, merged, created, pull) in commits("master", pulls, since, verbose):
            logged = False
            if verbose:
                sys.stderr.write("- {0}\n".format(commit))

            for (context, when, url, log) in logs(commit):
                if verbose:
                    sys.stderr.write(" - {0} {1}\n".format(when, context))

                for (status, name, body, tracker) in tap(log):
                    write(pull=pull, revision=commit, status=status,
                          context=context, date=when, merged=merged,
                          test=name, url=url, tracker=tracker, log=body)
                    logged = True

                # Nothing found for this log. Note that "logged" is tracked per
                # revision, so this only happens while the revision has no
                # records at all yet
                if not logged:
                    write(pull=pull, revision=commit, status="unknown", date=when,
                          merged=merged, url=url, log=log)
                    logged = True

            # Nothing found for this revision
            if not logged:
                write(pull=pull, revision=commit, status="unknown", date=created, merged=merged)
                logged = True

        sys.stdout.flush()
    except BaseException:
        # Don't leave a half written temporary file next to the real data
        if outname:
            try:
                if output is not None:
                    output.close()
            finally:
                os.unlink(outname)
        raise

    # Only a data file that we opened ourselves gets closed; stdout was
    # flushed above and stays usable for the rest of the process
    if outname:
        output.close()
        os.rename(outname, filename)
        if not dry:
            upload = [ os.path.join(BOTS, "image-upload"), "--state", filename ]
            subprocess.check_call(upload)


# An HTML parser that just pulls out all the
# link hrefs in a given page of content.
# We also qualify these
# hrefs with a base url, in case they're relative
class HrefParser(html.parser.HTMLParser):
    # base:  URL that relative hrefs get resolved against
    # hrefs: list owned by the caller, each resolved URL is appended to it
    def __init__(self, base, hrefs):
        html.parser.HTMLParser.__init__(self)
        self.hrefs = hrefs
        self.base = base

    def handle_starttag(self, tag, attrs):
        if tag.lower() != "a":
            return
        for (name, value) in attrs:
            if name.lower() == "href":
                self.hrefs.append(urllib.parse.urljoin(self.base, value))


# Check if a given pull request was included in its base
# branch via merging or otherwise
class Pulls():
    def __init__(self, since):
        # Base branches that were already fetched from origin
        self.fetched = { }
        # Pull number -> result of the git log search in merged()
        self.checked = { }
        # Known pull objects: keyed by number when listed, by URL when
        # looked up individually in normalize()
        self.pulls = { }
        # The pull objects produced by the last complete iteration
        self.listing = [ ]
        self.since = since

    # Get all the pull requests since a given time
    def __iter__(self):
        if self.listing:
            # Replay the cached listing. Iterating self.pulls here would
            # also produce the URL keyed entries and change the size of
            # the dict while it is being iterated
            iterate = list(self.listing)
        else:
            iterate = task.api.pulls(state="all", since=self.since)
        listing = [ ]
        for pull in iterate:
            self.pulls[pull["number"]] = pull
            listing.append(pull)
            yield pull
        self.listing = listing

    # Turn a string/int pull number or pull URL into a pull object
    #
    # Returns the pull dict, or None when the lookup gave no result.
    # Raises ValueError for anything that is not an int, str or dict
    def normalize(self, pull):
        if isinstance(pull, int):
            pull = str(pull)
        if isinstance(pull, str):
            if "/" not in pull:
                pull = qualify("pulls/{0}".format(pull))
            if pull in self.pulls:
                return self.pulls[pull]
            url = pull
            # NOTE(review): presumably the API gives back nothing for a pull
            # request that no longer exists -- confirm against task.api
            pull = task.api.get(url)
            if not pull:
                return None
            # Store under the URL we asked for as well, so that the
            # lookup above finds it the next time
            self.pulls[url] = pull
            self.pulls[pull.get("url", url)] = pull
        elif not isinstance(pull, dict):
            raise ValueError("Invalid pull request: {0}".format(repr(pull)))
        return pull

    # Returns True/False whether the pull request ended up in its base
    # branch, or None when that is not (yet) known
    def merged(self, pull):
        pull = self.normalize(pull)
        if not pull:
            return None

        number = pull["number"]
        if number in self.checked:
            return self.checked[number]

        if pull.get("state") != "closed":
            return None

        # GitHub is telling us this was merged
        if pull.get("merged"):
            return True

        # Fetch git data about this branch. Use an absolute directory: the
        # plain dirname of a relative script path may be an empty string
        cwd = os.path.dirname(os.path.abspath(__file__))
        base = pull["base"]["ref"]
        if base not in self.fetched:
            try:
                subprocess.check_call([ "git", "fetch", "-q", "--", "origin", base ], cwd=cwd)
            except subprocess.CalledProcessError:
                return None # error already printed by process
            self.fetched[base] = base

        # Only look at git commits made later than a year before the pull
        # request was created
        when = time.mktime(time.strptime(pull["created_at"], "%Y-%m-%dT%H:%M:%SZ"))
        when -= 60 * 60 * 24 * 365
        since = time.strftime("%Y-%m-%d", time.gmtime(when))

        # Check if it's referred to in this branch
        # NOTE(review): the number is matched as a plain substring, so pull 12
        # also matches "Closes #123"
        match = "(Closes|Fixes|closes|fixes).*{0}".format(number)
        cmd = [
            "git", "log", "--extended-regexp", "--grep", match,
            "--since=" + since, "origin/" + base
        ]
        output = subprocess.check_output(cmd, cwd=cwd)
        self.checked[number] = bool(output)
        return self.checked[number]


# Retrieves the content of the given URL
#
# NOTE(review): certificate and host name verification are switched off
# here. Left as it is, but the retrieved content must not be trusted
def retrieve(url):
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    # Close the response once read rather than leaving it to the garbage collector
    with urllib.request.urlopen(url, context=ctx) as response:
        return response.read().decode('utf-8', 'replace')


# Returns a list of all links found at the given URL. A missing page (404)
# or a connection problem results in an empty or partial list
def links(url):
    result = [ ]
    parser = HrefParser(url, result)
    try:
        parser.feed(retrieve(url))
    except urllib.error.HTTPError as ex:
        if ex.code != 404:
            raise
    except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))
    return result


# Parses seed input data and passes it through to output
# all the while recording the fact that certain URLs have
# already been seen
#
# since:  timestamp, records dated before it are not passed through
# fp:     binary file object with one JSON object per line
# pulls:  a Pulls object, used to fill in unknown "merged" values
# output: binary file object that the kept records are written to
def seed(since, fp, pulls, output):
    seeded = None
    known = re.compile("# SKIP Known issue #([0-9]+)", re.IGNORECASE)

    while True:
        try:
            line = fp.readline()
        except (OSError, EOFError, zlib.error) as ex:
            # A truncated gzip file is reported as EOFError, not OSError
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            break
        if not line:
            break

        try:
            item = json.loads(line.decode('utf-8'))
        except ValueError as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            continue
        if not isinstance(item, dict):
            sys.stderr.write("tests-data: {0}\n".format("ignoring record that is not an object"))
            continue

        # Once we see a new pull treat the old one as complete and seeded
        # As a failsafe, just to make sure we didn't miss something
        # we don't treat the last pull request as completely seeded
        pull = item.get("pull")
        if pull and pull != seeded:
            if seeded is not None:
                SEEDED.add(seeded)
            seeded = None

        merged = item.get("merged")
        if pull and merged not in (True, False):
            merged = item["merged"] = pulls.merged(pull)

        # Note that we've already retrieved this URL
        url = item.get("url")
        if url and item.get("log") is not None:
            SEEDED.add(url)
            SEEDED.add(urllib.parse.urljoin(url, "./"))

        # If the pull request had a known merged value it can be seeded
        # This forces us to retrieve data about open pull requests again
        if merged in (True, False):
            seeded = pull
            revision = item.get("revision")
            if revision:
                SEEDED.add(revision)

        # Drop records without a usable date, and those that are too old
        date = item.get("date")
        if not date:
            continue
        try:
            when = time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%SZ"))
        except (TypeError, ValueError) as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            continue
        if since > when:
            continue

        # COMPAT: Fix data that wasn't yet valid
        if item.get("status") == "skip":
            match = known.search(item.get("log") or "")
            if match:
                item["status"] = "failure"
                item["tracker"] = qualify("issues/{0}".format(match.group(1)))

        line = json.dumps(item).encode('utf-8') + b"\n"
        output.write(line)


# Generate a list of (revision, merged, created, url) for the given branch
# This includes pull requests targeting the branch in question
#
# revision: the SHA of a commit
# merged: True/False/None whether merged or not
# created: the committer date, or the creation date of the pull request
# url: The URL for the pull request or None
def commits(branch, pulls, since, verbose=False):
    if verbose:
        sys.stderr.write("{0}\n".format(branch))

    # Iterate through commits on the branch
    for commit in task.api.commits(branch, since=since):
        revision = commit["sha"].lower()
        if revision not in SEEDED:
            yield revision, True, commit["commit"]["committer"]["date"], None

    # Iterate through pull requests
    for pull in pulls:
        # The seed data identifies pull requests by URL, so the number
        # alone never matches anything that seed() recorded
        if pull["number"] in SEEDED or pull.get("url") in SEEDED:
            continue
        if pull["base"]["ref"] != branch:
            continue
        if verbose:
            sys.stderr.write("pull-{0}\n".format(pull["number"]))
        merged = pulls.merged(pull)
        for revision in revisions(pull):
            yield revision, merged, pull["created_at"], pull["url"]
            # The next revisions for the pull request are not the ones
            # that got merged. Only the first one produced by revisions
            if merged:
                merged = False


# Get all the revisions in a pull request.
# GitHub doesn't help
# us here so we have to use silly tricks
#
# Generates the lower case SHA of the pull request head first, followed by
# every further revision that a test sink reports for this pull request
def revisions(pull):
    # A pull request may come with "head" set to null
    head = (pull.get("head") or { }).get("sha")
    if not head:
        return

    # First give back the main pull request
    head = head.lower()
    yield head

    # All the revisions we've seen
    seen = { head }

    # Seed the set of sinks. We use these sinks to figure out additional
    # revisions for the pull request. Unfortunately GitHub doesn't help us
    # with a list of revisions that this pull request used to reflect. So
    # we have to look to our sink for that info.
    # NOTE(review): presumably the API can give back nothing for an unknown
    # revision, hence the fallback to an empty result -- confirm
    data = task.api.get("commits/{0}/status?page=1&per_page=100".format(head)) or { }
    for status in data.get("statuses", [ ]):
        url = status.get("target_url")
        if not url:
            continue
        SEEDED.add(urllib.parse.urljoin(url, "./"))
        sink = urllib.parse.urljoin(url, "../")
        if sink not in SINKS:
            SINKS[sink] = links(sink)

    # Now ask each sink for its set of urls
    name = "pull-{0}".format(pull["number"])
    for found in list(SINKS.values()):
        for link in found:

            # We only care about stuff at the sink where pull-XXXX is in
            # the URL. This is how we figure out whether things are related
            if name not in link:
                continue

            # Already retrieved this one
            if link in SEEDED:
                continue

            # Build a URL for the cockpituous sink /status file and read it
            target = urllib.parse.urljoin(link, "status")
            try:
                data = json.loads(retrieve(target))
            except (ValueError, ConnectionError) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
                continue
            except urllib.error.HTTPError as ex:
                if ex.code != 404:
                    raise
                continue
            except urllib.error.URLError as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
                continue

            # The status file contains a "revision" field which is the git revision
            # of what was tested during that test run. This is what we're after.
            # Anything that is not an object with a string revision is ignored
            if not isinstance(data, dict):
                continue
            revision = data.get("revision")
            if not isinstance(revision, str):
                continue
            revision = revision.lower()
            if revision not in seen:
                seen.add(revision)
                yield revision


# Pull out all status (context, created, log) for a given revision.
# This includes multiple
# test runs for a given revision, and all the various status contexts
#
# Generates (context, created, url, log) for every status of the revision
# that is not pending, has a target URL and was not retrieved before.
# The log is an empty string when the target URL does not exist (404)
def logs(revision):
    page = 1
    per_page = 100
    count = per_page

    # A full page means that there may be another one after it
    while count == per_page:
        # NOTE(review): presumably the API can give back nothing for an unknown
        # revision, hence the fallback to an empty result -- confirm
        data = task.api.get("commits/{0}/status?page={1}&per_page={2}".format(revision, page, per_page)) or { }
        # Move on to the next page. Without this a revision with a full
        # first page would have that same page retrieved forever
        page += 1

        count = 0
        for status in data.get("statuses", [ ]):
            count += 1

            # Make sure to not consider "state": "success" as a success
            # here because individual tests may have failed, or been retried.
            #
            # Always only consider tests individually to have run or failed
            # not entire test suite statuses
            if status["state"] == "pending":
                continue

            target = status.get("target_url")
            if not target:
                continue

            # Statuses may link to an HTML rendering, the raw log has no extension
            if target.endswith(".html"):
                target = target[:-5]
            if target in SEEDED:
                continue

            log = None
            try:
                log = retrieve(target)
            except urllib.error.HTTPError as ex:
                if ex.code != 404:
                    raise
                log = ""
            except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))

            # None means the log could not be retrieved this time around
            if log is not None:
                yield (status["context"], status["created_at"], target, log)


# Generate (status, name, body, tracker) for each Test Anything Protocol test
# in the content.
#
# status: possible values "success", "failure", "skip"
# name: the name of the test
# body: full log of the test
# tracker: url tracking the failure, or None
def tap(content):
    # Same pattern that is used to repair previously collected data
    known = re.compile("# SKIP Known issue #([0-9]+)", re.IGNORECASE)

    name = status = tracker = None
    prefix = None
    body = [ ]
    blocks = False

    for line in content.split('\n'):

        # The test intro, everything before here is fluff
        if not prefix and line.startswith("1.."):
            prefix = line
            body = [ ]
            name = status = tracker = None

        # A TAP test status line
        elif line.startswith("ok ") or line.startswith("not ok "):
            body.append(line)

            # Parse out the status
            if line.startswith("not ok "):
                status = "failure"
                line = line[7:]
            else:
                line = line[3:]
                directive = line.upper()
                if "# SKIP KNOWN ISSUE" in directive:
                    # A test skipped due to a known issue is a failure that
                    # is tracked by that issue, and not a plain skip
                    status = "failure"
                    match = known.search(line)
                    if match:
                        tracker = qualify("issues/{0}".format(match.group(1)))
                elif "# SKIP" in directive:
                    status = "skip"
                else:
                    status = "success"

            # Parse out the name: drop the test number, the directive and
            # the duration. The line may well be empty after the number
            while line and (line[0].isspace() or line[0].isdigit()):
                line = line[1:]
            (name, delim, directive) = line.partition("#")
            (name, delim, directive) = name.partition("duration")
            name = name.strip()

            # Old Cockpit tests had strange blocks
            if not blocks:
                yield (status, name, "\n".join(body), tracker)
                status = name = tracker = None
                body = [ ]

        else:
            # Old Cockpit tests didn't bound their stuff properly, a
            # separator line ends the previous test and starts the next one
            # NOTE(review): a last test that is not followed by a separator
            # is never produced in this mode -- confirm that this is intended
            if line.startswith("# --------------------"):
                blocks = True
                if status:
                    yield (status, name, "\n".join(body), tracker)
                name = status = tracker = None
                body = [ ]
            body.append(line)


# Qualify a path into a GitHub API URL for the repository
def qualify(path):
    return "https://api.github.com" + task.api.qualify(path)


if __name__ == '__main__':
    task.main(function=run, title="Pull out test data for pull requests", verbose=True)