pypi-mirror/jeepyb/cmd/run_mirror.py

374 lines
16 KiB
Python

#! /usr/bin/env python
# Copyright (C) 2011 OpenStack, LLC.
# Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
# Copyright (C) 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# run_mirror reads a YAML config file like:
#   cache-root: /tmp/cache
#
#   mirrors:
#     - name: openstack
#       projects:
#         - https://github.com/openstack/requirements
#       output: /tmp/mirror/openstack
#
#     - name: openstack-infra
#       projects:
#         - https://github.com/openstack-infra/config
#       output: /tmp/mirror/openstack-infra
#
# The algorithm it attempts to follow is:
#
# for each project:
#   clone if necessary and fetch origin
#   for each project-branch:
#     create new virtualenv
#     pip install reqs into virtualenv
#     if installation succeeds:
#       pip freeze > full-reqs
#       create new virtualenv
#       pip install (download only) full-reqs into virtualenv
#
# By default only summary information is printed on stdout (see the
# --verbose command line option to get more debug info).
#
# If "pip install" for a branch's requirements fails to complete
# (based on parsing of its output), that output will be copied to
# stderr and the script will skip ahead to the next branch. This
# makes it suitable for running in a cron job with only stdout
# redirected to a log, and also avoids one broken project preventing
# caching of requirements for others.
from __future__ import print_function
import argparse
import datetime
import md5
import os
import pkginfo
import re
import shlex
import shutil
import subprocess
import sys
import tempfile
import urllib
import yaml
class Mirror(object):
    """Build PyPI-style mirrors from the requirements of git projects.

    The constructor parses the command line and loads the YAML config.
    run() then, for every entry under the config's 'mirrors' key,
    downloads requirements into a pip cache (build_mirror) and turns
    that cache into a directory tree of packages with HTML indexes
    (process_cache).
    """

    def __init__(self):
        """Parse command-line options and load the config file given by -c."""
        parser = argparse.ArgumentParser(
            description='Build a pypi mirror from requirements')
        parser.add_argument('-b', dest='branch',
                            help='restrict run to a specified branch')
        # The config is read unconditionally below, so make argparse
        # report a missing -c instead of failing later in open(None).
        parser.add_argument('-c', dest='config', required=True,
                            help='specify the config file')
        parser.add_argument('-n', dest='noop', action='store_true',
                            help='do not run any commands')
        parser.add_argument('--no-pip', dest='no_pip', action='store_true',
                            help='do not run any pip commands')
        parser.add_argument('--verbose', dest='debug', action='store_true',
                            help='output verbose debug information')
        parser.add_argument('--no-download', dest='no_download',
                            action='store_true',
                            help='only process the pip cache into a mirror '
                            '(do not download)')
        parser.add_argument('--no-process', dest='no_process',
                            action='store_true',
                            help='only download into the pip cache '
                            '(do not process the cache into a mirror)')
        parser.add_argument('--no-update', dest='no_update',
                            action='store_true',
                            help='do not update any git repos')
        self.args = parser.parse_args()
        # NOTE(review): yaml.load can construct arbitrary Python objects.
        # The config is operator-supplied here; consider yaml.safe_load if
        # it could ever come from an untrusted source.
        with open(self.args.config) as config_file:
            self.config = yaml.load(config_file)

    def run_command(self, cmd):
        """Run a command line and return its stripped combined output.

        stderr is merged into stdout. Nothing is executed and '' is
        returned when -n is set, or when --no-pip is set and the
        executable name ends with 'pip'.
        """
        cmd_list = shlex.split(str(cmd))
        self.debug("Run: %s" % cmd)
        if self.args.noop:
            return ''
        if self.args.no_pip and cmd_list[0].endswith('pip'):
            return ''
        p = subprocess.Popen(cmd_list, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        (out, _) = p.communicate()
        out = out.strip()
        self.debug(out)
        return out

    def run(self):
        """Download and/or process every configured mirror."""
        for mirror in self.config['mirrors']:
            if not self.args.no_download:
                self.build_mirror(mirror)
            if not self.args.no_process:
                self.process_cache(mirror)

    def chdir(self, dest):
        """Change the working directory to dest unless -n is set."""
        self.debug("cd %s" % dest)
        if not self.args.noop:
            os.chdir(dest)

    def debug(self, msg):
        """Print msg only when --verbose was given."""
        if self.args.debug:
            print(msg)

    def process_http_requirements(self, reqlist, pip_cache_dir, pip):
        """Return the non-URL requirement lines from the files in reqlist.

        Lines containing http:// or https:// (including git+https://) are
        left out of the result; any pip cache entry for such a URL, and
        its '.content-type' companion, is deleted from pip_cache_dir.
        The pip argument is accepted but not used.
        """
        new_reqs = []
        for reqfile in reqlist:
            with open(reqfile) as req_lines:
                for req in req_lines:
                    req = req.strip()
                    # Handle http://, https://, and git+https?://
                    if not re.search('https?://', req):
                        new_reqs.append(req)
                        continue
                    target_url = req.split('#', 1)[0]
                    target_file = os.path.join(pip_cache_dir,
                                               urllib.quote(target_url, ''))
                    if os.path.exists(target_file):
                        self.debug("Unlink: %s" % target_file)
                        os.unlink(target_file)
                    if os.path.exists(target_file + '.content-type'):
                        self.debug("Unlink: %s.content-type" % target_file)
                        os.unlink(target_file + '.content-type')
        return new_reqs

    def find_pkg_info(self, path):
        """Return a set of 'name==version' strings for eggs under path.

        Only directories whose name ends in '.egg' and that contain
        EGG-INFO/PKG-INFO are considered.
        """
        versions = set()
        for root, dirs, files in os.walk(path):
            if not root.endswith('.egg'):
                continue
            if not os.path.exists(os.path.join(root, 'EGG-INFO', 'PKG-INFO')):
                continue
            package = pkginfo.Develop(root)
            versions.add('%s==%s' % (package.name, package.version))
        return versions

    def build_mirror(self, mirror):
        """Populate the pip download cache for one mirror definition.

        Each project in mirror['projects'] is cloned if needed and
        fetched, then every branch whose name starts with
        'remotes/origin' (except origin/HEAD) is handed to
        _cache_branch. A branch given with -b goes through the same
        filter, so it must also use the 'remotes/origin/...' form.
        The scratch directory is removed even if a step raises.
        """
        print("Building mirror: %s" % mirror['name'])
        project_cache_dir = os.path.join(self.config['cache-root'],
                                         'projects')
        pip_cache_dir = os.path.join(self.config['cache-root'],
                                     'pip', mirror['name'])
        workdir = tempfile.mkdtemp()
        try:
            if not self.args.noop:
                for cache_dir in (project_cache_dir, pip_cache_dir):
                    if not os.path.exists(cache_dir):
                        os.makedirs(cache_dir)

            for project in mirror['projects']:
                print("Updating repository: %s" % project)
                self.chdir(project_cache_dir)
                short_project = project.split('/')[-1]
                if short_project.endswith('.git'):
                    short_project = short_project[:-4]
                if not os.path.isdir(short_project):
                    self.run_command("git clone %s %s" %
                                     (project, short_project))
                self.chdir(os.path.join(project_cache_dir, short_project))
                self.run_command("git fetch -p origin")

                if self.args.branch:
                    branches = [self.args.branch]
                else:
                    branches = self.run_command("git branch -a").split("\n")

                for branch in branches:
                    branch = branch.strip()
                    if (not branch.startswith("remotes/origin")
                            or "origin/HEAD" in branch):
                        continue
                    self._cache_branch(project, branch, workdir,
                                       pip_cache_dir)
        finally:
            shutil.rmtree(workdir)

    def _cache_branch(self, project, branch, workdir, pip_cache_dir):
        """Install, freeze and re-download one branch's requirements.

        Operates on the current working directory, which build_mirror
        has set to the project checkout. Failures reported by pip are
        written to stderr and the branch is abandoned without raising.
        """
        pip_format = ("%s install -U %s --exists-action=w "
                      "--download-cache=%s --build %s -r %s")
        venv_format = ("virtualenv --clear --extra-search-dir=%s %s")
        upgrade_format = ("%s install -U --exists-action=w "
                          "--download-cache=%s --build %s %s")
        reqs = os.path.join(workdir, "reqs")
        venv = os.path.join(workdir, "venv")
        build = os.path.join(workdir, "build")
        pip = os.path.join(venv, "bin", "pip")

        print("Fetching pip requires for %s:%s" % (project, branch))
        if not self.args.no_update:
            self.run_command("git reset --hard %s" % branch)
            self.run_command("git clean -x -f -d -q")

        # A global-requirements.txt takes precedence over all other files.
        reqlist = []
        if os.path.exists('global-requirements.txt'):
            reqlist.append('global-requirements.txt')
        else:
            for requires_file in ("requirements.txt",
                                  "test-requirements.txt",
                                  "tools/pip-requires",
                                  "tools/test-requires"):
                if os.path.exists(requires_file):
                    reqlist.append(requires_file)
        if not reqlist:
            print("no requirements")
            return

        self.run_command(venv_format % (pip_cache_dir, venv))
        for package in ("setuptools", "pip", "virtualenv"):
            self.run_command(upgrade_format %
                             (pip, pip_cache_dir, build, package))
        if os.path.exists(build):
            shutil.rmtree(build)

        new_reqs = self.process_http_requirements(reqlist, pip_cache_dir,
                                                  pip)
        (reqfp, reqfn) = tempfile.mkstemp()
        try:
            with os.fdopen(reqfp, 'w') as reqfile:
                reqfile.write('\n'.join(new_reqs))
            out = self.run_command(pip_format %
                                   (pip, "", pip_cache_dir, build, reqfn))
        finally:
            # mkstemp files are never removed automatically.
            os.unlink(reqfn)

        if "\nSuccessfully installed " not in out:
            sys.stderr.write("Installing pip requires for %s:%s "
                             "failed.\n%s\n" %
                             (project, branch, out))
            print("pip install did not indicate success")
            return

        freeze = self.run_command("%s freeze -l" % pip)
        requires = self.find_pkg_info(build)
        for line in freeze.split("\n"):
            if line.startswith("-e ") or (
                    "==" in line and " " not in line):
                requires.add(line)
        with open(reqs, "w") as reqfd:
            for r in requires:
                reqfd.write(r + "\n")

        # Second pass: fresh virtualenv, download-only, pinned versions.
        self.run_command(venv_format % (pip_cache_dir, venv))
        if os.path.exists(build):
            shutil.rmtree(build)
        out = self.run_command(pip_format %
                               (pip, "--no-install",
                                pip_cache_dir, build, reqs))
        if "\nSuccessfully downloaded " not in out:
            sys.stderr.write("Downloading pip requires for "
                             "%s:%s failed.\n%s\n" %
                             (project, branch, out))
            print("pip install did not indicate success")
        print("cached:\n%s" % freeze)

    def process_cache(self, mirror):
        """Turn one mirror's pip cache into a browsable package tree.

        Files in the pip cache are grouped by package name (the part of
        the file name before the first '-<digit>' match of a greedy
        regex), copied into mirror['output']/<package>/, and listed in
        per-package and top-level index pages. Every output file is
        written under a dot-prefixed name and renamed once complete.
        Does nothing when -n is set.
        """
        if self.args.noop:
            return

        pip_cache_dir = os.path.join(self.config['cache-root'],
                                     'pip', mirror['name'])
        destination_mirror = mirror['output']
        PACKAGE_VERSION_RE = re.compile(r'(.*)-[0-9]')
        packages = {}
        package_count = 0

        if not os.path.exists(destination_mirror):
            os.makedirs(destination_mirror)

        for filename in os.listdir(pip_cache_dir):
            if filename.endswith('content-type'):
                continue
            realname = urllib.unquote(filename)
            # The ? accounts for sourceforge downloads
            tarball = os.path.basename(realname).split("?")[0]
            name_match = PACKAGE_VERSION_RE.search(tarball)
            if name_match is None:
                continue
            package_name = name_match.group(1)
            packages.setdefault(package_name, {})[tarball] = filename
            package_count = package_count + 1

        header = ("<html><head><title>PyPI Mirror</title></head>"
                  "<body><h1>PyPI Mirror</h1><h2>Last update: %s</h2>\n\n"
                  % datetime.datetime.utcnow().strftime("%c UTC"))
        footer = ("<p class='footer'>Generated by process_cache.py; %d\n"
                  "packages mirrored. </p>\n"
                  "</body></html>\n" % package_count)
        dot_full = os.path.join(destination_mirror, ".full.html")
        dot_index = os.path.join(destination_mirror, ".index.html")

        with open(dot_full, 'w') as full_html:
            with open(dot_index, 'w') as simple_html:
                full_html.write(header)
                simple_html.write(header)
                for package_name, versions in packages.items():
                    self._publish_package(pip_cache_dir, destination_mirror,
                                          package_name, versions,
                                          full_html, simple_html)
                full_html.write(footer)
                simple_html.write(footer)

        os.rename(dot_full, os.path.join(destination_mirror, "full.html"))
        os.rename(dot_index, os.path.join(destination_mirror, "index.html"))

    def _publish_package(self, pip_cache_dir, destination_mirror,
                         package_name, versions, full_html, simple_html):
        """Copy one package's files into the mirror and write its index.

        versions maps the published file name to the cache file name.
        Adds one link per file to full_html and one link for the package
        to simple_html.
        """
        full_html_line = "<a href='{dir}/{name}'>{name}</a><br />\n"
        destination_dir = os.path.join(destination_mirror, package_name)
        if not os.path.isdir(destination_dir):
            os.makedirs(destination_dir)

        safe_dir = urllib.quote(package_name)
        simple_html.write("<a href='%s'>%s</a><br />\n" %
                          (safe_dir, safe_dir))

        dot_index = os.path.join(destination_dir, ".index.html")
        with open(dot_index, 'w') as index:
            index.write("<html><head>\n"
                        "<title>%s &ndash; PyPI Mirror</title>\n"
                        "</head><body>\n" % package_name)
            for tarball, filename in versions.items():
                source_path = os.path.join(pip_cache_dir, filename)
                destination_path = os.path.join(destination_dir, tarball)
                dot_destination_path = os.path.join(destination_dir,
                                                    '.' + tarball)
                # Package archives are binary; copy them byte-for-byte.
                with open(source_path, 'rb') as source:
                    src = source.read()
                md5sum = md5.md5(src).hexdigest()
                with open(dot_destination_path, 'wb') as dest:
                    dest.write(src)

                safe_name = urllib.quote(tarball)
                full_html.write(full_html_line.format(dir=safe_dir,
                                                      name=safe_name))
                index.write("<a href='%s#md5=%s'>%s</a>\n" %
                            (safe_name, md5sum, safe_name))
                os.rename(dot_destination_path, destination_path)
            index.write("</body></html>\n")
        os.rename(dot_index, os.path.join(destination_dir, "index.html"))
def main():
    """Construct a Mirror from the command line and run it."""
    Mirror().run()
# Script entry point: build the mirror only when run directly, not on import.
if __name__ == "__main__":
    main()