gitdm/gitdm

378 lines
10 KiB
Python
Executable File

#!/usr/bin/python
#
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-8 LWN.net
# Copyright 2007-8 Jonathan Corbet <corbet@lwn.net>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
import database, csv, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822, string
from patterns import *
# Date captured once at startup; grabpatch() clamps any commit date later
# than this back to Today.
Today = datetime.date.today()
#
# Control options.  These are module-level defaults; ParseOpts() overrides
# them from the command line.
#
# -u: forwarded to database.LookupEmployer() by LookupStoreHacker().
MapUnknown = 0
# Cleared by -d; while set, reports.DevReports() is called at the end.
DevReports = 1
# -D: write the per-date line counts (PrintDateStats) and exit.
DateStats = 0
# Cleared by -s; while clear, a Signed-off-by from the patch's own author
# is not recorded on the patch.
AuthorSOBs = 1
# -r: compiled regular expression applied to file names; None means no
# file filtering.
FileFilter = None
# -x: file object opened for writing and handed to csv.OutputCSV().
CSVFile = None
# -a: run TrimLTSOBs() on each counted changeset.
AkpmOverLt = 0
# -z: call database.DumpDB() once all changesets have been read.
DumpDB = 0
# -c: name passed to ConfigFile.ConfigFile().
CFName = 'gitdm.config'
#
# Options:
#
# -a Andrew Morton's signoffs shadow Linus's
# -c cfile Specify a configuration file
# -d Omit individual developer stats (they are output by default)
# -D Output date statistics
# -h hfile HTML output to hfile
# -l count Maximum length for output lists
# -o file File for text output
# -r pattern Restrict to files matching pattern
# -s Ignore author SOB lines
# -u Map unknown employers to '(Unknown)'
# -x file.csv Export raw statistics as CSV
# -z Dump out the hacker database at completion
def ParseOpts():
    """Apply the command-line switches in sys.argv[1:] to the option globals.

    Flags without an argument flip one of the module-level control options.
    -h, -l and -o are forwarded straight to the reports module, -r compiles
    its argument into FileFilter and -x opens the CSV output file.
    Positional arguments left over after the switches are ignored.
    getopt.GetoptError propagates for an unrecognised switch.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile

    switches, _ = getopt.getopt(sys.argv[1:], 'adc:Dh:l:o:r:sux:z')
    for flag, value in switches:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            # Developer reports are on by default; -d turns them off.
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            reports.SetHTMLOutput(open(value, 'w'))
        elif flag == '-l':
            reports.SetMaxList(int(value))
        elif flag == '-o':
            reports.SetOutput(open(value, 'w'))
        elif flag == '-r':
            print('Filter on "%s"' % value)
            FileFilter = re.compile(value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-x':
            CSVFile = open(value, 'w')
            print("open output file " + value + "\n")
        elif flag == '-z':
            DumpDB = 1
def LookupStoreHacker(name, email):
    """Return the database entry for (name, email), creating it if needed.

    The address is first passed through database.RemapEmail().  Lookup is
    tried by address, then by name; a hit by name gets the new address
    attached to the existing entry.  Only when both lookups miss is a new
    entry stored.
    """
    email = database.RemapEmail(email)
    known = database.LookupEmail(email)
    if known:
        # This address is already in the database.
        return known
    employers = database.LookupEmployer(email, MapUnknown)
    namesake = database.LookupName(name)
    if not namesake:
        return database.StoreHacker(name, employers, email)
    # Known person, new address.
    namesake.addemail(email, employers)
    return namesake
#
# Date tracking.
#
# Maps a date to the number of changed lines accumulated for it.
DateMap = {}


def AddDateLines(date, lines):
    """Add `lines` to the running total kept for `date` in DateMap.

    A patch of more than 1000000 lines is reported on stdout and left out
    of the totals.
    """
    if lines > 1000000:
        print('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get(date, 0) + lines
def PrintDateStats():
    """Write the per-date line counts in DateMap to the file 'datelc'.

    One line is written per date, in ascending date order, holding the date
    (YYYY/MM/DD), the lines recorded for that date and the running total so
    far.  The file is created in the current directory, overwriting any
    existing one, and is closed even if a write fails.
    """
    total = 0
    datef = open('datelc', 'w')
    try:
        for date in sorted(DateMap):
            total += DateMap[date]
            datef.write('%d/%02d/%02d %6d %7d\n' % (date.year, date.month,
                                                    date.day, DateMap[date],
                                                    total))
    finally:
        # The original never closed the file and relied on interpreter exit.
        datef.close()
#
# Let's slowly try to move some smarts into this class.
#
class patch:
    """Per-changeset record that grabpatch() fills in while parsing.

    `date` is not initialised here; grabpatch() assigns it.
    """

    def __init__(self, commit):
        self.commit = commit
        # Merge flag and counts of added / removed diff lines.
        self.merge = 0
        self.added = 0
        self.removed = 0
        # Placeholder author, replaced when an author line is parsed.
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        # Signoffs, stored by grabpatch() as (email, hacker) pairs.
        self.sobs = []
        # Hackers credited by Reviewed-by / Tested-by / Reported-by tags.
        self.reviews = []
        self.testers = []
        self.reports = []

    def addreviewer(self, reviewer):
        """Record `reviewer` as having reviewed this changeset."""
        self.reviews.append(reviewer)

    def addtester(self, tester):
        """Record `tester` as having tested this changeset."""
        self.testers.append(tester)

    def addreporter(self, reporter):
        """Record `reporter` as having reported the problem fixed here."""
        self.reports.append(reporter)
#
# The core hack for grabbing the information about a changeset.
#
def grabpatch():
    """Parse the next changeset from stdin and return it as a patch.

    Reading starts at the global NextLine and stops at the following commit
    header, which is left in NextLine for the next call.  Returns None when
    the input runs out before a commit header is found.

    Unless the changeset is a merge, or a file filter is active and no
    added/removed lines were counted, the global line totals, the per-date
    map and the employer changeset/signoff counters are updated as a side
    effect.
    """
    global NextLine, TotalAdded, TotalRemoved, TotalChanged

    #
    # Skip ahead to the next commit header; give up at end of input.
    #
    while 1:
        m = Pcommit.match(NextLine)
        if m:
            break
        NextLine = sys.stdin.readline()
        if not NextLine:
            return None
    p = patch(m.group(1))
    #
    # Make sure the patch always carries a date: without this default, a
    # changeset with no (parseable) date line blew up below with an
    # AttributeError on p.date.
    #
    p.date = Today
    NextLine = sys.stdin.readline()
    # With a file filter, lines are ignored until a matching file is seen.
    ignore = (FileFilter is not None)
    while NextLine:
        Line = NextLine
        #
        # If this line starts a new commit, drop out.
        #
        m = Pcommit.match(Line)
        if m:
            break
        NextLine = sys.stdin.readline()
        #
        # Maybe it's an author line?
        #
        m = Pauthor.match(Line)
        if m:
            p.email = database.RemapEmail(m.group(2))
            p.author = LookupStoreHacker(m.group(1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = Psob.search(Line)
        if m:
            email = database.RemapEmail(m.group(2))
            sobber = LookupStoreHacker(m.group(1), email)
            if sobber != p.author or AuthorSOBs:
                # Reuse the hacker just looked up instead of repeating the
                # database lookup with the unmapped address.
                p.sobs.append((email, sobber))
            continue
        #
        # Various other tags of interest.  The credit calls receive this
        # changeset (p); the original passed the patch class by mistake.
        #
        m = Preview.search(Line)        # Reviewed-by:
        if m:
            email = database.RemapEmail(m.group(2))
            p.addreviewer(LookupStoreHacker(m.group(1), email))
            continue
        m = Ptest.search(Line)          # Tested-by:
        if m:
            email = database.RemapEmail(m.group(2))
            p.addtester(LookupStoreHacker(m.group(1), email))
            p.author.testcredit(p)
            continue
        m = Prep.search(Line)           # Reported-by:
        if m:
            email = database.RemapEmail(m.group(2))
            p.addreporter(LookupStoreHacker(m.group(1), email))
            p.author.reportcredit(p)
            continue
        m = Preptest.search(Line)       # Reported-and-tested-by:
        if m:
            email = database.RemapEmail(m.group(2))
            h = LookupStoreHacker(m.group(1), email)
            p.addreporter(h)
            p.addtester(h)
            p.author.reportcredit(p)
            p.author.testcredit(p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = Pmerge.match(Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = Pdate.match(Line)
        if m:
            dt = rfc822.parsedate(m.group(2))
            if dt is None:
                # parsedate() returns None for text it cannot parse; keep
                # the default date rather than crashing on dt[0].
                sys.stderr.write('Unparseable date: %s\n' % m.group(2))
                continue
            p.date = datetime.date(dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        #
        # If we have a file filter, check for file lines.
        #
        if FileFilter:
            ignore = ApplyFileFilter(Line, ignore)
        #
        # OK, maybe it's part of the diff itself.
        #
        if not ignore:
            if Padd.match(Line):
                p.added += 1
                continue
            if Prem.match(Line):
                p.removed += 1
    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.  This work should be done
    # elsewhere.
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        changed = max(p.added, p.removed)
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += changed
        AddDateLines(p.date, changed)
        empl = p.author.emailemployer(p.email, p.date)
        empl.AddCSet(p)
        if AkpmOverLt:
            TrimLTSOBs(p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer(sobemail, p.date)
            empl.AddSOB()
    return p
def ApplyFileFilter(line, ignore):
    """Return the new 'ignore' state after looking at one diff line.

    A line matching Pfilea (the first file line, '--- a/', per the original
    comment) decides the state outright: 0 if the file name matches
    FileFilter, 1 otherwise.  A line matching Pfileb (the second file line)
    can only switch ignoring off.  Any other line leaves `ignore` as it was.
    """
    first = Pfilea.match(line)
    if first:
        if FileFilter.search(first.group(1)):
            return 0
        return 1
    #
    # For the second line, we can turn ignore off, but not on.
    #
    second = Pfileb.match(line)
    if second and FileFilter.search(second.group(1)):
        return 0
    return ignore
#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs(p):
    """Drop the Linus signoff from p.sobs when the Akpm one is there too.

    Linus and Akpm are the module-level (email, hacker) pairs; the list is
    modified in place and nothing is returned.
    """
    signoffs = p.sobs
    if Linus not in signoffs:
        return
    if Akpm in signoffs:
        signoffs.remove(Linus)
#
# Here starts the real program.
#
ParseOpts()
#
# Load the configuration file named by CFName.
#
ConfigFile.ConfigFile(CFName)
#
# Pre-seed the database with the two hackers TrimLTSOBs() needs to
# recognise, kept as (email, hacker) pairs like the entries of patch.sobs.
#
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))
#
# State shared with grabpatch(): the look-ahead line and the line totals.
#
NextLine = sys.stdin.readline()
TotalChanged = TotalAdded = TotalRemoved = 0
#
# Read changesets until grabpatch() runs out of input, showing progress
# on stderr every 50 iterations.
#
sys.stderr.write('Grabbing changesets...\r')
printcount = CSCount = 0
while True:
    if printcount % 50 == 0:
        sys.stderr.write('Grabbing changesets...%d\r' % printcount)
    printcount += 1
    p = grabpatch()
    if not p:
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    # With a file filter, skip changesets in which nothing was counted.
    if FileFilter and p.added == 0 and p.removed == 0:
        continue
    if not p.merge:
        p.author.addpatch(p)
        for _, signer in p.sobs:
            signer.addsob(p)
        for reviewer in p.reviews:
            reviewer.addreview(p)
        for tester in p.testers:
            tester.addtested(p)
        for reporter in p.reports:
            reporter.addreport(p)
        CSCount += 1
    # NOTE(review): placed at loop level, so merges are accumulated too;
    # the original indentation was lost - confirm against the real file.
    csv.AccumulatePatch(p)
sys.stderr.write('Grabbing changesets...done\n')
if DumpDB:
    database.DumpDB()
#
# Summary header.
#
hlist = database.AllHackers()
elist = database.AllEmployers()
reports.Write('Processed %d csets from %d developers\n' % (CSCount,
                                                            len(hlist)))
reports.Write('%d employers found\n' % len(elist))
reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
              (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
if TotalChanged == 0:
    TotalChanged = 1  # HACK to avoid div by zero
#
# Date statistics replace the remaining output: write them and stop.
#
if DateStats:
    PrintDateStats()
    sys.exit(0)
csv.OutputCSV(CSVFile)
if CSVFile is not None:
    CSVFile.close()
if DevReports:
    reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports(elist, TotalChanged, CSCount)