Merge branch 'refactoring' of git://gitorious.org/mining-tools/gitdm into german

This commit is contained in:
Jonathan Corbet 2011-07-11 13:51:58 -06:00
commit 47ffed3cee
11 changed files with 926 additions and 159 deletions

View File

@ -13,18 +13,42 @@
import sys, re, datetime, os.path
import database
#
# Read a line and strip out junk.
#
def ReadConfigLine (file):
    """
    Return the next meaningful line from the open config file `file`.

    Everything from a '#' to the end of the line is dropped and the
    remainder is stripped of surrounding white space.  Lines that end up
    empty are skipped.  Returns None once the end of the file is reached.
    """
    # Loop rather than recurse: a long run of comment or blank lines
    # would otherwise use one stack frame per skipped line.
    while True:
        line = file.readline ()
        if not line:
            return None
        line = line.split ('#')[0].strip ()
        if line:
            return line
class ReadConfigLine:
    """
    Iterator over the meaningful lines of a configuration file.

    Everything from a '#' to the end of a line is treated as a comment
    and dropped, surrounding white space is stripped, and lines that end
    up empty are skipped.

    Typical use case:

        fd = open(filename, 'r')
        for line in ReadConfigLine(fd):
            parse_line(line)
        fd.close()
    """

    def __init__(self, fd):
        # fd: any object with a readline() method returning '' at EOF.
        self.fd = fd
        # Not used by this class; kept so existing attribute access
        # keeps working.
        self.buffer = None
        self.patch = []

    def __iter__(self):
        return self

    def next(self):
        """Return the next non-empty, comment-free line or stop."""
        while True:
            line = self.fd.readline()
            if not line:
                raise StopIteration
            line = line.split('#')[0].strip()
            if line:
                return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next
#
# Give up and die.
@ -38,19 +62,19 @@ def croak (message):
#
def ReadEmailAliases (name):
    """
    Read the email alias file `name` and register every alias.

    Each meaningful line holds an alias (optionally double-quoted)
    followed by white space and the address it maps to.  croak() is
    called when the file cannot be opened, when a line does not have
    that shape, or when the target does not look like an address.
    """
    try:
        fd = open (name, 'r')
    except IOError:
        croak ('Unable to open email alias file %s' % (name))
    try:
        for line in ReadConfigLine (fd):
            m = re.match (r'^("[^"]+"|\S+)\s+(.+)$', line)
            if not m or len (m.groups ()) != 2:
                croak ('Funky email alias line "%s"' % (line))
            # find() <= 0 rejects both a missing '@' and a leading '@'.
            if m and m.group (2).find ('@') <= 0:
                croak ('Non-addresses in email alias "%s"' % (line))
            database.AddEmailAlias (m.group (1).replace ('"', ''), m.group (2))
    finally:
        fd.close ()
#
# The Email/Employer map
@ -59,11 +83,11 @@ EMMpat = re.compile (r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$')
def ReadEmailEmployers (name):
try:
file = open (name, 'r')
fd = open (name, 'r')
except IOError:
croak ('Unable to open email/employer file %s' % (name))
line = ReadConfigLine (file)
while line:
for line in ReadConfigLine (fd):
m = EMMpat.match (line)
if not m:
croak ('Funky email/employer line "%s"' % (line))
@ -71,8 +95,8 @@ def ReadEmailEmployers (name):
company = m.group (2).strip ()
enddate = ParseDate (m.group (4))
database.AddEmailEmployerMapping (email, company, enddate)
line = ReadConfigLine (file)
file.close ()
fd.close ()
def ParseDate (cdate):
if not cdate:
@ -83,22 +107,22 @@ def ParseDate (cdate):
def ReadGroupMap (fname, employer):
    """
    Read the group map file `fname` and map every entry to `employer`.

    Each meaningful line of the file is passed, together with
    `employer`, to database.AddEmailEmployerMapping().  croak() is
    called when the file cannot be opened.
    """
    try:
        fd = open (fname, 'r')
    except IOError:
        croak ('Unable to open group map file %s' % (fname))
    try:
        for line in ReadConfigLine (fd):
            database.AddEmailEmployerMapping (line, employer)
    finally:
        fd.close ()
#
# Read in a virtual employer description.
#
def ReadVirtual (file, name):
def ReadVirtual (fd, name):
ve = database.VirtualEmployer (name)
line = ReadConfigLine (file)
while line:
for line in ReadConfigLine (fd):
sl = line.split (None, 1)
first = sl[0]
if first == 'end':
@ -116,23 +140,57 @@ def ReadVirtual (file, name):
if not (0 < percent <= 100):
croak ('Bad split value "%s" for virtual empl %s' % (first, name))
ve.addsplit (' '.join (sl[1:]), percent/100.0)
line = ReadConfigLine (file)
#
# We should never get here
#
croak ('Missing "end" line for virtual employer %s' % (name))
#
# Read file type patterns for more fine-grained reports
#
def ReadFileType (filename):
    """
    Read a file type mapping file.

    The file holds 'order <type1>,<type2>,...' and
    'filetype <type> <regex>' lines.  Only the first 'order' line is
    honoured.  A type that has patterns but is missing from the order is
    reported on stdout and appended to the end of the order.

    Returns (patterns, order): `patterns` maps each type name to a list
    of case-insensitive compiled regexes, `order` is the list of type
    names in precedence order.  croak() is called when the file cannot
    be opened or a line matches neither form.
    """
    try:
        fd = open (filename, 'r')
    except IOError:
        croak ('Unable to open file type mapping file %s' % (filename))
    patterns = {}
    order = []
    regex_order = re.compile (r'^order\s+(.*)$')
    regex_file_type = re.compile (r'^filetype\s+(\S+)\s+(.+)$')
    try:
        for line in ReadConfigLine (fd):
            o = regex_order.match (line)
            if o:
                # Consider only the first definition in the config file
                elements = o.group (1).replace (' ', '')
                order = order or elements.split (',')
                continue
            m = regex_file_type.match (line)
            if not m or len (m.groups ()) != 2:
                # croak lives in this module; "ConfigFile.croak" would be
                # an attribute lookup on the ConfigFile function.
                croak ('Funky file type line "%s"' % (line))
            file_type = m.group (1)
            if file_type not in patterns:
                patterns[file_type] = []
                if file_type not in order:
                    sys.stdout.write ('%s not found, appended to the last order\n'
                                      % file_type)
                    order.append (file_type)
            patterns[file_type].append (re.compile (m.group (2), re.IGNORECASE))
    finally:
        fd.close ()
    return patterns, order
#
# Read an overall config file.
#
def ConfigFile (name, confdir):
try:
file = open (name, 'r')
fd = open (name, 'r')
except IOError:
croak ('Unable to open config file %s' % (name))
line = ReadConfigLine (file)
while line:
for line in ReadConfigLine (fd):
sline = line.split (None, 2)
if len (sline) < 2:
croak ('Funky config line: "%s"' % (line))
@ -146,7 +204,20 @@ def ConfigFile (name, confdir):
ReadGroupMap (os.path.join (confdir, sline[1]), sline[2])
elif sline[0] == 'VirtualEmployer':
ReadVirtual (file, ' '.join (sline[1:]))
elif sline[0] == 'FileTypeMap':
patterns, order = ReadFileType (os.path.join (confdir, sline[1]))
database.FileTypes = database.FileType (patterns, order)
else:
croak ('Unrecognized config line: "%s"' % (line))
line = ReadConfigLine (file)
if __name__ == '__main__':
    # Test the iterator for reading configuration files: print every
    # meaningful line of the file named on the command line.
    try:
        fd = open (sys.argv[1])
    except (IndexError, IOError):
        # Missing argument or unreadable file; anything else (e.g. a
        # keyboard interrupt) is allowed to propagate.
        croak ('Usage: %s <config-file>' % sys.argv[0])
    try:
        for line in ReadConfigLine (fd):
            sys.stdout.write (line + '\n')
    finally:
        fd.close ()

47
README
View File

@ -20,6 +20,10 @@ Run it like this:
git log -p -M [details] | gitdm [options]
Alternatively, you can run with:
git log --numstat -M [details] | gitdm -n [options]
The [details] tell git which changesets are of interest; the [options] can
be:
@ -32,26 +36,35 @@ be:
By default, "./gitdm.config" is used.
-d Omit the developer reports, giving employer information
only.
only.
-D Rather than create the usual statistics, create a
file (datelc) providing lines changed per day, where the first column
displays the changes that happened only on that day and the second sums
that day's changes with those of all previous days. This option is
suitable for feeding to a tool like gnuplot.
-D Rather than create the usual statistics, create a file (datelc.csv)
providing lines changed per day, where the first column displays
the changes that happened only on that day and the second sums that
day's changes with those of all previous days. This option is suitable
for feeding to a tool like gnuplot.
-h file Generate HTML output to the given file
-l num Only list the top <num> entries in each report.
-n Use --numstat instead of generated patches to get the statistics.
-o file Write text output to the given file (default is stdout).
-p prefix Dump out the database categorized by changeset and by file type.
It requires -n, otherwise it is not possible to get separated results.
-r pat Only generate statistics for changes to files whose
name matches the given regular expression.
-s Ignore Signed-off-by lines which match the author of
each patch.
-t Generate a report by type of contribution (code, documentation, etc.).
It requires -n, otherwise this option is ignored silently.
-u Group all unknown developers under the "(Unknown)"
employer.
@ -68,6 +81,10 @@ looks like:
git log -p -M v2.6.19..v2.6.20 | \
gitdm -u -s -a -o results -h results.html
or:
git log --numstat -M v2.6.19..v2.6.20 | \
gitdm -u -s -a -n -o results -h results.html
CONFIGURATION FILE
@ -134,6 +151,24 @@ end
for example, no check to ensure that the percentages add up to
something rational.
FileTypeMap file
Map file names/extensions onto file types. These files contain lines
like:
order <type1>,<type2>,...,<typeN>
filetype <type> <regex>
...
This construct allows fine-grained reports by type of contribution
(build, code, image, multimedia, documentation, etc.)
Order is important because a file name may match the patterns of more
than one type. For instance, ltmain.sh fits better as 'build' than as
'code' (it matches the file name pattern as well as '\.sh$'). The first
element in order has precedence over the following ones.
OTHER TOOLS

40
csv.py
View File

@ -1,40 +0,0 @@
#
# aggregate per-month statistics for people
#
import sys, datetime
class CSVStat:
    """Added/removed line totals for one person in one period."""

    def __init__ (self, name, employer, date):
        self.name = name
        self.employer = employer
        self.date = date
        # Running totals, filled in by accumulate().
        self.added = 0
        self.removed = 0

    def accumulate (self, p):
        """Add the added/removed counts of patch `p` to the totals."""
        self.added += p.added
        self.removed += p.removed
PeriodCommitHash = { }
def AccumulatePatch (p, Aggregate):
    """
    Fold patch `p` into the per-author, per-period statistics.

    The period is the ISO year and week when Aggregate is 'week', and
    the first day of the patch's month otherwise.  Statistics are kept
    in PeriodCommitHash under the key "<author name>-<period>".
    """
    if Aggregate == 'week':
        iso = p.date.isocalendar ()
        period = "%.2d-%.2d" % (iso[0], iso[1])
    else:
        period = "%.2d-%.2d-01" % (p.date.year, p.date.month)
    key = "%s-%s" % (p.author.name, period)
    if key in PeriodCommitHash:
        entry = PeriodCommitHash[key]
    else:
        # First patch for this author in this period.
        employer = p.author.emailemployer (p.email, p.date)
        entry = CSVStat (p.author.name, employer, period)
        PeriodCommitHash[key] = entry
    entry.accumulate (p)
def OutputCSV (file):
    """
    Write the accumulated per-period statistics to `file`.

    Output is tab-separated with a header line; names are written in
    double quotes.  Nothing is written when `file` is None.
    """
    if file is None:
        return
    file.write ("Name\tAffliation\tDate\tAdded\tRemoved\n")
    for stat in PeriodCommitHash.values ():
        # sanitise names " is common and \" sometimes too
        employer = stat.employer.name.replace ("\"", ".").replace ("\\", ".")
        author = stat.name.replace ("\"", ".").replace ("\\", ".")
        row = "\"%s\"\t\"%s\"\t%s\t%d\t%d\n" % (author, employer, stat.date,
                                                stat.added, stat.removed)
        file.write (row)

88
csvdump.py Normal file
View File

@ -0,0 +1,88 @@
#
# aggregate per-month statistics for people
#
import sys, datetime
import csv
class CSVStat:
    """Added/removed line totals for one person in one period."""

    def __init__ (self, name, email, employer, date):
        self.name = name
        self.email = email
        self.employer = employer
        self.date = date
        # Running totals, filled in by accumulate().
        self.added = 0
        self.removed = 0

    def accumulate (self, p):
        """Add the added/removed counts of patch `p` to the totals."""
        self.added += p.added
        self.removed += p.removed
PeriodCommitHash = { }
def AccumulatePatch (p, Aggregate):
    """
    Fold patch `p` into the per-author, per-period statistics.

    The period is the ISO year and week when Aggregate is 'week', and
    the first day of the patch's month otherwise.  Statistics are kept
    in PeriodCommitHash under the key "<author name>-<period>".
    """
    if Aggregate == 'week':
        iso = p.date.isocalendar ()
        period = "%.2d-%.2d" % (iso[0], iso[1])
    else:
        period = "%.2d-%.2d-01" % (p.date.year, p.date.month)
    key = "%s-%s" % (p.author.name, period)
    if key in PeriodCommitHash:
        entry = PeriodCommitHash[key]
    else:
        # First patch for this author in this period.
        employer = p.author.emailemployer (p.email, p.date)
        entry = CSVStat (p.author.name, p.email, employer, period)
        PeriodCommitHash[key] = entry
    entry.accumulate (p)
# Rows collected by store_patch() and written out by save_csv().
ChangeSets = []
FileTypes = []

def store_patch(patch):
    """
    Record a non-merge patch for the later CSV dump.

    Appends one row to ChangeSets (commit, date, email, domain, author,
    employer, added, removed) and one row per file type to FileTypes
    (commit, file type, added, removed).  Merges are ignored.
    """
    if patch.merge:
        return
    employer = patch.author.emailemployer(patch.email, patch.date)
    employer = employer.name.replace('"', '.').replace ('\\', '.')
    # Sanitise the author name in one chain; restarting from
    # patch.author.name would throw away the earlier replacements.
    author = patch.author.name.replace ('"', '.').replace ('\\', '.')
    author = author.replace ("'", '.')
    try:
        domain = patch.email.split('@')[1]
    except IndexError:
        # No '@' in the address: fall back to the whole string.
        domain = patch.email
    ChangeSets.append([patch.commit, str(patch.date),
                       patch.email, domain, author, employer,
                       patch.added, patch.removed])
    for (filetype, (added, removed)) in patch.filetypes.items():
        FileTypes.append([patch.commit, filetype, added, removed])
def save_csv (prefix='data'):
    """
    Dump the collected rows to '<prefix>-changesets.csv' and
    '<prefix>-filetypes.csv'.

    A file is only created when there are rows to write to it.
    """
    # Dump the ChangeSets
    if len(ChangeSets) > 0:
        fd = open('%s-changesets.csv' % prefix, 'w')
        try:
            writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC)
            # Header order follows the rows built by store_patch():
            # the email column comes before the domain column.
            writer.writerow (['Commit', 'Date', 'Email',
                              'Domain', 'Name', 'Affliation',
                              'Added', 'Removed'])
            writer.writerows (ChangeSets)
        finally:
            fd.close ()
    # Dump the file types
    if len(FileTypes) > 0:
        fd = open('%s-filetypes.csv' % prefix, 'w')
        try:
            writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow (['Commit', 'Type', 'Added', 'Removed'])
            writer.writerows (FileTypes)
        finally:
            fd.close ()
def OutputCSV (file):
    """
    Write the accumulated per-period statistics to `file` as CSV.

    A header row is written first; non-numeric fields are quoted.
    Nothing is written when `file` is None.
    """
    if file is None:
        return
    out = csv.writer (file, quoting=csv.QUOTE_NONNUMERIC)
    header = ['Name', 'Email', 'Affliation', 'Date', 'Added', 'Removed']
    out.writerow (header)
    for stat in PeriodCommitHash.values():
        # sanitise names " is common and \" sometimes too
        employer = stat.employer.name.replace ('"', '.').replace ('\\', '.')
        author = stat.name.replace ('"', '.').replace ('\\', '.')
        out.writerow ([author, stat.email, employer, stat.date,
                       stat.added, stat.removed])
# Public names of this module; save_csv is called by gitdm for -p.
__all__ = [ 'AccumulatePatch', 'OutputCSV', 'store_patch', 'save_csv' ]

View File

@ -188,6 +188,25 @@ class VirtualEmployer (Employer):
# Should check that they add up too, but I'm lazy
Employers[self.name] = self
class FileType:
    """
    Classify file names by type using ordered lists of regexes.

    patterns maps a type name to a list of compiled regexes; order lists
    the type names in precedence order (earlier entries win).
    """

    def __init__ (self, patterns=None, order=None):
        # Build fresh containers per instance; literal {} / [] defaults
        # would be shared by every instance created without arguments.
        self.patterns = {} if patterns is None else patterns
        self.order = [] if order is None else order

    def guess_file_type (self, filename, patterns=None, order=None):
        """
        Return the first type in `order` with a pattern that matches
        `filename`, or 'unknown' when nothing matches.

        `patterns` and `order` default to the instance's own when
        omitted or empty.
        """
        patterns = patterns or self.patterns
        order = order or self.order
        for file_type in order:
            # Types listed in the order but lacking patterns are skipped.
            for patt in patterns.get (file_type, ()):
                if patt.search (filename):
                    return file_type
        return 'unknown'
FileTypes = None
#
# Mix all the virtual employers into their real destinations.
#

195
gitdm
View File

@ -1,4 +1,5 @@
#!/usr/bin/pypy
#-*- coding:utf-8 -*-
#
#
@ -6,15 +7,17 @@
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
import database, csv, ConfigFile, reports
import database, csvdump, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822, string
from patterns import *
import logparser
from patterns import patterns
Today = datetime.date.today()
@ -32,11 +35,14 @@ DateStats = 0
AuthorSOBs = 1
FileFilter = None
CSVFile = None
CSVPrefix = None
AkpmOverLt = 0
DumpDB = 0
CFName = 'gitdm.config'
DirName = ''
Aggregate = 'month'
Numstat = 0
ReportByFileType = 0
#
# Options:
@ -48,7 +54,9 @@ Aggregate = 'month'
# -D Output date statistics
# -h hfile HTML output to hfile
# -l count Maximum length for output lists
# -n Use numstats instead of generated patch from git log
# -o file File for text output
# -p prefix Prefix for CSV output
# -r pattern Restrict to files matching pattern
# -s Ignore author SOB lines
# -u Map unknown employers to '(Unknown)'
@ -59,9 +67,10 @@ Aggregate = 'month'
def ParseOpts ():
global MapUnknown, DevReports
global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
global CFName, CSVFile, DirName, Aggregate
global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
global ReportByFileType
opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z')
opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:no:p:r:stuwx:z')
for opt in opts:
if opt[0] == '-a':
AkpmOverLt = 1
@ -77,13 +86,19 @@ def ParseOpts ():
reports.SetHTMLOutput (open (opt[1], 'w'))
elif opt[0] == '-l':
reports.SetMaxList (int (opt[1]))
elif opt[0] == '-n':
Numstat = 1
elif opt[0] == '-o':
reports.SetOutput (open (opt[1], 'w'))
elif opt[0] == '-p':
CSVPrefix = opt[1]
elif opt[0] == '-r':
print 'Filter on "%s"' % (opt[1])
FileFilter = re.compile (opt[1])
elif opt[0] == '-s':
AuthorSOBs = 0
elif opt[0] == '-t':
ReportByFileType = 1
elif opt[0] == '-u':
MapUnknown = 1
elif opt[0] == '-x':
@ -139,6 +154,8 @@ def PrintDateStats():
# Let's slowly try to move some smarts into this class.
#
class patch:
(ADDED, REMOVED) = range (2)
def __init__ (self, commit):
self.commit = commit
self.merge = self.added = self.removed = 0
@ -148,6 +165,7 @@ class patch:
self.reviews = [ ]
self.testers = [ ]
self.reports = [ ]
self.filetypes = {}
def addreviewer (self, reviewer):
self.reviews.append (reviewer)
@ -157,36 +175,57 @@ class patch:
def addreporter (self, reporter):
self.reports.append (reporter)
def addfiletype (self, filetype, added, removed):
if self.filetypes.has_key (filetype):
self.filetypes[filetype][self.ADDED] += added
self.filetypes[filetype][self.REMOVED] += removed
else:
self.filetypes[filetype] = [added, removed]
def parse_numstat(line, file_filter):
    """
    Parse one line of 'git log --numstat' output.

    Returns (filename, filetype, added, removed).  All four are None
    when the line is not a numstat line or when `file_filter` (a
    compiled regex, or None for no filtering) does not match the file
    name as printed by git.
    """
    m = patterns['numstat'].match (line)
    if not m:
        return None, None, None, None
    filename = m.group (3)
    # If we have a file filter, check for file lines.
    if file_filter and not file_filter.search (filename):
        return None, None, None, None
    try:
        added = int (m.group (1))
        removed = int (m.group (2))
    except ValueError:
        # A binary file (image, etc.) is marked with '-'
        added = removed = 0
    # Collapse 'prefix{old => new}suffix' to the new path.
    m = patterns['rename'].match (filename)
    if m:
        filename = '%s%s%s' % (m.group (1), m.group (3), m.group (4))
    # database.FileTypes stays None unless the config has a FileTypeMap
    # line; report such files as 'unknown' instead of crashing.
    if database.FileTypes is None:
        filetype = 'unknown'
    else:
        filetype = database.FileTypes.guess_file_type (os.path.basename(filename))
    return filename, filetype, added, removed
#
# The core hack for grabbing the information about a changeset.
#
def grabpatch():
global NextLine
while (1):
m = Pcommit.match (NextLine)
if m:
break;
NextLine = sys.stdin.readline ()
if not NextLine:
return
def grabpatch(logpatch):
m = patterns['commit'].match (logpatch[0])
if not m:
return None
p = patch(m.group (1))
NextLine = sys.stdin.readline ()
ignore = (FileFilter is not None)
while NextLine:
Line = NextLine
#
# If this line starts a new commit, drop out.
#
m = Pcommit.match (Line)
if m:
break
NextLine = sys.stdin.readline ()
for Line in logpatch[1:]:
#
# Maybe it's an author line?
#
m = Pauthor.match (Line)
m = patterns['author'].match (Line)
if m:
p.email = database.RemapEmail (m.group (2))
p.author = LookupStoreHacker(m.group (1), p.email)
@ -194,7 +233,7 @@ def grabpatch():
#
# Could be a signed-off-by:
#
m = Psob.match (Line)
m = patterns['signed-off-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
sobber = LookupStoreHacker(m.group (1), email)
@ -204,24 +243,26 @@ def grabpatch():
#
# Various other tags of interest.
#
m = Preview.match (Line) # Reviewed-by:
m = patterns['reviewed-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addreviewer (LookupStoreHacker(m.group (1), email))
continue
m = Ptest.match (Line) # Tested-by:
m = patterns['tested-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addtester (LookupStoreHacker (m.group (1), email))
p.author.testcredit (patch)
continue
m = Prep.match (Line) # Reported-by:
# Reported-by:
m = patterns['reported-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addreporter (LookupStoreHacker (m.group (1), email))
p.author.reportcredit (patch)
continue
m = Preptest.match (Line) # Reported-and-tested-by:
# Reported-and-tested-by:
m = patterns['reported-and-tested-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
h = LookupStoreHacker (m.group (1), email)
@ -233,14 +274,14 @@ def grabpatch():
#
# If this one is a merge, make note of the fact.
#
m = Pmerge.match (Line)
m = patterns['merge'].match (Line)
if m:
p.merge = 1
continue
#
# See if it's the date.
#
m = Pdate.match (Line)
m = patterns['date'].match (Line)
if m:
dt = rfc822.parsedate(m.group (2))
p.date = datetime.date (dt[0], dt[1], dt[2])
@ -248,20 +289,29 @@ def grabpatch():
sys.stderr.write ('Funky date: %s\n' % p.date)
p.date = Today
continue
#
# If we have a file filter, check for file lines.
#
if FileFilter:
ignore = ApplyFileFilter (Line, ignore)
#
# OK, maybe it's part of the diff itself.
#
if not ignore:
if Padd.match (Line):
p.added += 1
continue
if Prem.match (Line):
p.removed += 1
if not Numstat:
#
# If we have a file filter, check for file lines.
#
if FileFilter:
ignore = ApplyFileFilter (Line, ignore)
#
# OK, maybe it's part of the diff itself.
#
if not ignore:
if patterns['add'].match (Line):
p.added += 1
continue
if patterns['rem'].match (Line):
p.removed += 1
else:
# Get the statistics (lines added/removes) using numstats
# and without requiring a diff (--numstat instead -p)
(filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
if filename:
p.added += added
p.removed += removed
p.addfiletype (filetype, added, removed)
if '@' in p.author.name:
GripeAboutAuthorName (p.author.name)
@ -279,7 +329,7 @@ def ApplyFileFilter (line, ignore):
# If this is the first file line (--- a/), set ignore one way
# or the other.
#
m = Pfilea.match (line)
m = patterns['filea'].match (line)
if m:
file = m.group (1)
if FileFilter.search (file):
@ -288,13 +338,29 @@ def ApplyFileFilter (line, ignore):
#
# For the second line, we can turn ignore off, but not on
#
m = Pfileb.match (line)
m = patterns['fileb'].match (line)
if m:
file = m.group (1)
if FileFilter.search (file):
return 0
return ignore
def is_svntag(logpatch):
    """
    Tell whether the changeset `logpatch` (a list of log lines) is a
    commit on a Subversion tag.

    This is a workaround for a bug on the migration to Git from
    Subversion found in GNOME.  It may happen in other repositories as
    well.  A warning is written to stderr for the first matching line.
    """
    svn_tag = patterns['svn-tag']
    for raw in logpatch:
        found = svn_tag.match(raw.strip())
        if found is None:
            continue
        sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' %
                          (found.group (0),))
        return True
    return False
#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
@ -324,7 +390,6 @@ if AkpmOverLt == 1:
Akpm = ('akpm@linux-foundation.org',
LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))
NextLine = sys.stdin.readline ()
TotalChanged = TotalAdded = TotalRemoved = 0
#
@ -332,12 +397,23 @@ TotalChanged = TotalAdded = TotalRemoved = 0
#
print >> sys.stderr, 'Grabbing changesets...\r',
patches = logparser.LogPatchSplitter(sys.stdin)
printcount = CSCount = 0
while (1):
for logpatch in patches:
if (printcount % 50) == 0:
print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
printcount += 1
p = grabpatch()
# We want to ignore commits on svn tags: in Subversion a tag is a
# copy of the whole repository, which leads to wrong results. Some
# migrations from Subversion to Git do not catch all of these
# tag copies and import each one as just another big changeset.
if is_svntag(logpatch):
continue
p = grabpatch(logpatch)
if not p:
break
# if p.added > 100000 or p.removed > 100000:
@ -373,8 +449,9 @@ while (1):
hacker.addtested (p)
for hacker in p.reports:
hacker.addreport (p)
CSCount += 1
csv.AccumulatePatch (p, Aggregate)
CSCount += 1
csvdump.AccumulatePatch (p, Aggregate)
csvdump.store_patch (p)
print >> sys.stderr, 'Grabbing changesets...done '
if DumpDB:
@ -403,10 +480,16 @@ if TotalChanged == 0:
if DateStats:
PrintDateStats ()
csv.OutputCSV (CSVFile)
if CSVFile is not None:
CSVFile.close ()
if CSVPrefix:
csvdump.save_csv (CSVPrefix)
if CSVFile:
csvdump.OutputCSV (CSVFile)
CSVFile.close ()
if DevReports:
reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports (elist, TotalChanged, CSCount)
if ReportByFileType and Numstat:
reports.ReportByFileType (hlist)

View File

@ -20,3 +20,8 @@ EmailMap sample-config/domain-map
#
# GroupMap sample-config/illuminati The Illuminati
#
#
# Use FileTypeMap to map file names to file types using
# regular expressions.
#
FileTypeMap sample-config/filetypes.txt

90
logparser.py Normal file
View File

@ -0,0 +1,90 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
#
# Copyright © 2009 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
import sys
from patterns import patterns
class LogPatchSplitter:
    """
    LogPatchSplitter provides an iterator to extract every
    changeset from a git log output.

    Typical use case:

        patches = LogPatchSplitter(sys.stdin)

        for patch in patches:
            parse_patch(patch)
    """

    def __init__(self, fd, commit_pattern=None):
        # fd: object with a readline() method returning '' at EOF.
        # commit_pattern: compiled regex recognising the first line of a
        # changeset; defaults to patterns['commit'].
        self.fd = fd
        self.commit_pattern = commit_pattern
        # Commit line read ahead while finishing the previous patch.
        self.buffer = None
        # Not used by this class; kept for existing attribute access.
        self.patch = []

    def __iter__(self):
        return self

    def next(self):
        patch = self.__grab_patch__()
        if not patch:
            raise StopIteration
        return patch

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def __grab_patch__(self):
        """
        Extract a patch from the file descriptor and the
        patch is returned as a list of lines.

        Returns None when no further commit line is found.
        """
        commit = self.commit_pattern
        if commit is None:
            commit = patterns['commit']

        line = self.buffer or self.fd.readline()
        # The buffered line is consumed here.  Without this reset a
        # commit line that is the very last line of the input would be
        # returned again on every later call.
        self.buffer = None

        # Skip anything in front of the first commit line.
        while line and not commit.match(line):
            line = self.fd.readline()
        if not line:
            return None

        patch = [line]
        line = self.fd.readline()
        while line:
            # If this line starts a new commit, drop out.
            if commit.match(line):
                self.buffer = line
                break
            patch.append(line)
            line = self.fd.readline()
        return patch
if __name__ == '__main__':
    # Split the log on stdin and echo it back, one banner per changeset.
    emit = sys.stdout.write
    for changeset in LogPatchSplitter(sys.stdin):
        emit('---------- NEW PATCH ----------\n')
        for text in changeset:
            # Lines keep their own newline, so write them unchanged.
            emit(text)

View File

@ -1,10 +1,12 @@
#
# -*- coding:utf-8 -*-
# Pull together regular expressions used in multiple places.
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
@ -16,24 +18,34 @@ import re
# expressions." Now they have two problems.
# -- Jamie Zawinski
#
# Module-level regular expressions for the lines of "git log" output.
# Pemail is a fragment appended to the tag patterns below; its two
# groups capture the (optionally double-quoted) name and the address
# found between '<' and '>'.
Pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name
Pcommit = re.compile (r'^commit ([0-9a-f ]+)$')
Pauthor = re.compile (r'^Author:' + Pemail + '$')
Psob = re.compile (r'^\s+Signed-off-by:' + Pemail + '.*$')
Pmerge = re.compile (r'^Merge:.*$')
# Lines starting with a single '+' or '-' (not '++' or '--').
Padd = re.compile (r'^\+[^+].*$')
Prem = re.compile (r'^-[^-].*$')
Pdate = re.compile (r'^(Commit)?Date:\s+(.*)$')
# The '--- ' and '+++ ' lines; group 1 is the rest of the line.
Pfilea = re.compile (r'^---\s+(.*)$')
Pfileb = re.compile (r'^\+\+\+\s+(.*)$')
Preview = re.compile (r'^\s+Reviewed-by:' + Pemail + '.*$')
# tested-by and reported-and-tested-by are matched case-insensitively.
Ptest = re.compile (r'^\s+tested-by:' + Pemail + '.*$', re.I)
Prep = re.compile (r'^\s+Reported-by:' + Pemail + '.*$')
Preptest = re.compile (r'^\s+reported-and-tested-by:' + Pemail + '.*$', re.I)
#
# Merges are described with a variety of lines.
#
PExtMerge = re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$')
PIntMerge = re.compile(r'^ +(Merge|Pull) .* into .*$')
# PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$")
PIntMerge2 = re.compile(r"^ +Merge .*$")
# Fragment shared by the tag patterns: group 1 is the (optionally
# double-quoted) name, group 2 the address between '<' and '>'.
_pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name

# Compiled regular expressions for the lines of "git log" output, keyed
# by a short name.  Every literal is a raw string so that the regex
# escapes (\s, \d, \{ ...) are never read as string escapes.
patterns = {
    'commit': re.compile (r'^commit ([0-9a-f ]+)$'),
    'author': re.compile (r'^Author:' + _pemail + '$'),
    'signed-off-by': re.compile (r'^\s+Signed-off-by:' + _pemail + '.*$'),
    'merge': re.compile (r'^Merge:.*$'),
    'add': re.compile (r'^\+[^+].*$'),
    'rem': re.compile (r'^-[^-].*$'),
    'date': re.compile (r'^(Commit)?Date:\s+(.*)$'),
    # filea, fileb are used only in 'patch mode' (-p)
    'filea': re.compile (r'^---\s+(.*)$'),
    'fileb': re.compile (r'^\+\+\+\s+(.*)$'),
    'reviewed-by': re.compile (r'^\s+Reviewed-by:' + _pemail+ '.*$'),
    'tested-by': re.compile (r'^\s+tested-by:' + _pemail + '.*$', re.I),
    'reported-by': re.compile (r'^\s+Reported-by:' + _pemail + '.*$'),
    'reported-and-tested-by': re.compile (r'^\s+reported-and-tested-by:' + _pemail + '.*$', re.I),
    #
    # Merges are described with a variety of lines.
    #
    'ExtMerge': re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$'),
    'IntMerge': re.compile(r'^ +(Merge|Pull) .* into .*$'),
    # PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$"),
    'IntMerge2': re.compile(r"^ +Merge .*$"),
    # Another way to get the statistics (per file).
    # It implies --numstat
    'numstat': re.compile(r'^(\d+|-)\s+(\d+|-)\s+(.*)$'),
    'rename' : re.compile(r'(.*)\{(.*) => (.*)\}(.*)'),
    # Detect errors on svn conversions
    'svn-tag': re.compile(r"^svn path=/tags/(.*)/?; revision=([0-9]+)$"),
}

View File

@ -340,4 +340,46 @@ def EmplReports (elist, totalchanged, cscount):
ReportByELChanged (elist, totalchanged)
ReportByESOBs (elist)
ReportByEHackers (elist)
def ReportByFileType (hacker_list):
    """
    Print three reports of added/removed lines by file type: per
    developer, per type with the contributing developers, and the
    overall totals per type.

    Each element of `hacker_list` provides `name` and `patches`; each
    patch provides a `filetypes` dict mapping a type name to an
    (added, removed) pair.  The figures are written to standard output.
    """
    import sys

    # Positions of the counters in the [added, removed] lists.  The
    # patch class defines the same values as (ADDED, REMOVED) = range (2);
    # using local names avoids relying on a loop variable that is
    # unbound when a developer has no patches.
    ADDED, REMOVED = 0, 1
    write = sys.stdout.write

    # total maps type -> [added, removed, [[name, added, removed], ...]]
    total = {}

    BeginReport ('Developer contributions by type')
    for h in hacker_list:
        by_hacker = {}
        for patch in h.patches:
            for (filetype, (added, removed)) in patch.filetypes.items ():
                # Get a summary by hacker
                if filetype in by_hacker:
                    by_hacker[filetype][ADDED] += added
                    by_hacker[filetype][REMOVED] += removed
                else:
                    by_hacker[filetype] = [added, removed]
                # Update the totals
                if filetype in total:
                    total[filetype][ADDED] += added
                    total[filetype][REMOVED] += removed
                else:
                    total[filetype] = [added, removed, []]
        # Print a summary by hacker
        write ('%s\n' % (h.name,))
        for filetype, counters in by_hacker.items ():
            write ('\t%s %s\n' % (filetype, counters))
            total[filetype][2].append ([h.name, counters[ADDED],
                                        counters[REMOVED]])

    # Print the global summary
    BeginReport ('Contributions by type and developers')
    for filetype, (added, removed, hackers) in total.items ():
        write ('%s %s %s\n' % (filetype, added, removed))
        for name, h_added, h_removed in hackers:
            write ('\t%s: [%d, %d]\n' % (name, h_added, h_removed))

    # Print the very global summary
    BeginReport ('General contributions by type')
    for filetype, (added, removed, hackers) in total.items ():
        write ('%s %s %s\n' % (filetype, added, removed))

362
sample-config/filetypes.txt Normal file
View File

@ -0,0 +1,362 @@
# -*- coding:utf-8 -*-
# Copyright (C) 2006 Libresoft
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors : Gregorio Robles <grex@gsyc.escet.urjc.es>
# Authors : Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file contains association parameters regarding filetypes
# (documentation, development, multimedia, images...)
#
# format:
# filetype <type> <regex> [<comment>]
#
# Order:
# The list should keep an order, so filetypes can be counted properly,
# i.e. we want ltmain.sh -> 'build' instead of 'code'.
#
# If there is a filetype which is not listed in the order but has values,
# it will be added at the end.
#
order image,translation,ui,multimedia,package,build,code,documentation,devel-doc
#
#
# Code files (headers and the like included)
# (most common languages first)
#
filetype code \.c$ # C
filetype code \.pc$ # C
filetype code \.ec$ # C
filetype code \.ecp$ # C
filetype code \.C$ # C++
filetype code \.cpp$ # C++
filetype code \.c\+\+$ # C++
filetype code \.cxx$ # C++
filetype code \.cc$ # C++
filetype code \.pcc$ # C++
filetype code \.cpy$ # C++
filetype code \.h$ # C or C++ header
filetype code \.hh$ # C++ header
filetype code \.hpp$ # C++ header
filetype code \.hxx$ # C++ header
filetype code \.sh$ # Shell
filetype code \.pl$ # Perl
filetype code \.pm$ # Perl
filetype code \.pod$ # Perl
filetype code \.perl$ # Perl
filetype code \.cgi$ # CGI
filetype code \.php$ # PHP
filetype code \.php3$ # PHP
filetype code \.php4$ # PHP
filetype code \.inc$ # PHP
filetype code \.py$ # Python
filetype code \.java$ # Java
filetype code \.class$ # Java Class (or at least a class in some OOPL)
filetype code \.ada$ # ADA
filetype code \.ads$ # ADA
filetype code \.adb$ # ADA
filetype code \.pad$ # ADA
filetype code \.s$ # Assembly
filetype code \.S$ # Assembly
filetype code \.asm$ # Assembly
filetype code \.awk$ # awk
filetype code \.cs$ # C#
filetype code \.csh$ # CShell (including tcsh)
filetype code \.cob$ # COBOL
filetype code \.cbl$ # COBOL
filetype code \.COB$ # COBOL
filetype code \.CBL$ # COBOL
filetype code \.exp$ # Expect
filetype code \.l$ # (F)lex
filetype code \.ll$ # (F)lex
filetype code \.lex$ # (F)lex
filetype code \.f$ # Fortran
filetype code \.f77$ # Fortran
filetype code \.F$ # Fortran
filetype code \.hs$ # Haskell
filetype code \.lhs$ # Not preprocessed Haskell
filetype code \.el$ # LISP (including Scheme)
filetype code \.scm$ # LISP (including Scheme)
filetype code \.lsp$ # LISP (including Scheme)
filetype code \.jl$ # LISP (including Scheme)
filetype code \.ml$ # ML
filetype code \.ml3$ # ML
filetype code \.m3$ # Modula3
filetype code \.i3$ # Modula3
filetype code \.m$ # Objective-C
filetype code \.p$ # Pascal
filetype code \.pas$ # Pascal
filetype code \.rb$ # Ruby
filetype code \.sed$ # sed
filetype code \.tcl$ # TCL
filetype code \.tk$ # TCL
filetype code \.itk$ # TCL
filetype code \.y$ # Yacc
filetype code \.yy$ # Yacc
filetype code \.idl$ # CORBA IDL
filetype code \.gnorba$ # GNOME CORBA IDL
filetype code \.oafinfo$ # GNOME OAF
filetype code \.mcopclass$ # MCOP IDL compiler generated class
filetype code \.autoforms$ # Autoform
filetype code \.atf$ # Autoform
filetype code \.gnuplot$
filetype code \.xs$ # Shared library? Seen a lot of them in gnome-perl
filetype code \.js$ # JavaScript (and who knows, maybe more)
filetype code \.patch$
filetype code \.diff$ # Sometimes patches appear this way
filetype code \.ids$ # Not really sure what this means
filetype code \.upd$ # ¿¿¿??? (from Kcontrol)
filetype code \.ad$ # ¿¿¿??? (from Kdisplay and mc)
filetype code \.i$ # Appears in the kbindings for Qt
filetype code \.pri$ # from Qt
filetype code \.schema$ # Not really sure what this means
filetype code \.fd$ # Something to do with latex
filetype code \.cls$ # Something to do with latex
filetype code \.pro$ # Postscript generation
filetype code \.ppd$ # PDF generation
filetype code \.dlg$ # Not really sure what this means
filetype code \.plugin$ # Plug-in file
filetype code \.dsp # Microsoft Developer Studio Project File
filetype code \.vim$ # vim syntax file
filetype code \.trm$ # gnuplot term file
filetype code \.font$ # Font mapping
filetype code \.ccg$ # C++ files - Found in gtkmm*
filetype code \.hg$ # C++ headers - Found in gtkmm*
filetype code \.dtd # XML Document Type Definition
filetype code \.bat # DOS batch files
filetype code \.vala # Vala
filetype code \.py\.in$
filetype code \.rhtml$ # eRuby
filetype code \.sql$ # SQL script
#
#
# Development documentation files (for hacking generally)
#
filetype devel-doc ^readme.*$
filetype devel-doc ^changelog.*
filetype devel-doc ^todo.*$
filetype devel-doc ^credits.*$
filetype devel-doc ^authors.*$
filetype devel-doc ^changes.*$
filetype devel-doc ^news.*$
filetype devel-doc ^install.*$
filetype devel-doc ^hacking.*$
filetype devel-doc ^copyright.*$
filetype devel-doc ^licen(s|c)e.*$
filetype devel-doc ^copying.*$
filetype devel-doc manifest$
filetype devel-doc faq$
filetype devel-doc building$
filetype devel-doc howto$
filetype devel-doc design$
filetype devel-doc \.files$
filetype devel-doc files$
filetype devel-doc subdirs$
filetype devel-doc maintainers$
filetype devel-doc developers$
filetype devel-doc contributors$
filetype devel-doc thanks$
filetype devel-doc releasing$
filetype devel-doc test$
filetype devel-doc testing$
filetype devel-doc build$
filetype devel-doc comments?$
filetype devel-doc bugs$
filetype devel-doc buglist$
filetype devel-doc problems$
filetype devel-doc debug$
filetype devel-doc hacks$
filetype devel-doc hacking$
filetype devel-doc versions?$
filetype devel-doc mappings$
filetype devel-doc tips$
filetype devel-doc ideas?$
filetype devel-doc spec$
filetype devel-doc compiling$
filetype devel-doc notes$
filetype devel-doc missing$
filetype devel-doc done$
filetype devel-doc \.omf$ # XML-based format used in GNOME
filetype devel-doc \.lsm$
filetype devel-doc ^doxyfile$
filetype devel-doc \.kdevprj$
filetype devel-doc \.directory$
filetype devel-doc \.dox$
filetype devel-doc \.doap$
#
#
# Building, compiling, configuration and CVS admin files
#
filetype build \.in.*$
filetype build configure.*$
filetype build makefile.*$
filetype build config\.sub$
filetype build config\.guess$
filetype build config\.status$
filetype build ltmain\.sh$
filetype build autogen\.sh$
filetype build config$
filetype build conf$
filetype build cvsignore$
filetype build \.cfg$
filetype build \.m4$
filetype build \.mk$
filetype build \.mak$
filetype build \.make$
filetype build \.mbx$
filetype build \.protocol$
filetype build \.version$
filetype build mkinstalldirs$
filetype build install-sh$
filetype build rules$
filetype build \.kdelnk$
filetype build \.menu$
filetype build linguas$ # Build translations
filetype build potfiles.*$ # Build translations
filetype build \.shlibs$ # Shared libraries
# filetype build %debian%
# filetype build %specs/%
filetype build \.spec$ # It seems they're necessary for RPM build
filetype build \.def$ # build bootstrap for DLLs on win32
#
#
# Documentation files
#
# filetype documentation doc/%
# filetype documentation %HOWTO%
filetype documentation \.html$
filetype documentation \.txt$
filetype documentation \.ps(\.gz|\.bz2)?$
filetype documentation \.dvi(\.gz|\.bz2)?$
filetype documentation \.lyx$
filetype documentation \.tex$
filetype documentation \.texi$
filetype documentation \.pdf(\.gz|\.bz2)?$
filetype documentation \.djvu$
filetype documentation \.epub$
filetype documentation \.sgml$
filetype documentation \.docbook$
filetype documentation \.wml$
filetype documentation \.xhtml$
filetype documentation \.phtml$
filetype documentation \.shtml$
filetype documentation \.htm$
filetype documentation \.rdf$
filetype documentation \.phtm$
filetype documentation \.tmpl$
filetype documentation \.ref$ # References
filetype documentation \.css$
# filetype documentation %tutorial%
filetype documentation \.templates$
filetype documentation \.dsl$
filetype documentation \.ent$
filetype documentation \.xml$
filetype documentation \.xmi$
filetype documentation \.xsl$
filetype documentation \.entities$
filetype documentation \.[1-7]$ # Man pages
filetype documentation \.man$
filetype documentation \.manpages$
filetype documentation \.doc$
filetype documentation \.rtf$
filetype documentation \.wpd$
filetype documentation \.qt3$
filetype documentation man\d?/.*\.\d$
filetype documentation \.docs$
filetype documentation \.sdw$ # OpenOffice.org Writer document
filetype documentation \.odt$ # OpenOffice.org document
filetype documentation \.en$ # Files in English language
filetype documentation \.de$ # Files in German
filetype documentation \.es$ # Files in Spanish
filetype documentation \.fr$ # Files in French
filetype documentation \.it$ # Files in Italian
filetype documentation \.cz$ # Files in Czech
filetype documentation \.page$ # Mallard
filetype documentation \.page.stub$ # Mallard stub
#
#
# Images
#
filetype image \.png$
filetype image \.jpg$
filetype image \.jpeg$
filetype image \.bmp$
filetype image \.gif$
filetype image \.xbm$
filetype image \.eps$
filetype image \.mng$
filetype image \.pnm$
filetype image \.pbm$
filetype image \.ppm$
filetype image \.pgm$
filetype image \.gbr$
filetype image \.svg$
filetype image \.fig$
filetype image \.tif$
filetype image \.swf$
filetype image \.svgz$
filetype image \.shape$ # XML files used for shapes for instance in Kivio
filetype image \.sml$ # XML files used for shapes for instance in Kivio
filetype image \.bdf$ # vfontcap - Vector Font Capability Database (VFlib Version 2)
filetype image \.ico$
filetype image \.dia$ # We consider .dia as images, I don't want them in unknown
#
#
# Translation files
#
filetype translation \.po$
filetype translation \.pot$
filetype translation \.charset$
filetype translation \.mo$
#
#
# User interface files
#
filetype ui \.desktop$
filetype ui \.ui$
filetype ui \.xpm$
filetype ui \.xcf$
filetype ui \.3ds$
filetype ui \.theme$
filetype ui \.kimap$
filetype ui \.glade$
filetype ui \.gtkbuilder$
filetype ui rc$
#
#
# Sound files
#
filetype multimedia \.mp3$
filetype multimedia \.ogg$
filetype multimedia \.wav$
filetype multimedia \.au$
filetype multimedia \.mid$
filetype multimedia \.vorbis$
filetype multimedia \.midi$
filetype multimedia \.arts$
#
#
# Packages (yes, there are people who upload packages to the repo)
#
filetype package \.tar$
filetype package \.tar.gz$
filetype package \.tar.bz2$
filetype package \.tar.xz$
filetype package \.tgz$
filetype package \.deb$
filetype package \.rpm$
filetype package \.srpm$
filetype package \.ebuild$