Merge branch 'refactoring' of git://gitorious.org/mining-tools/gitdm into german

2011-07-11 13:51:58 -06:00 · 2011-07-11 13:51:58 -06:00 · 47ffed3cee
parent 85004f0f9b 69f9ad7e64
commit 47ffed3cee
11 changed files with 926 additions and 159 deletions
--- a/ConfigFile.py
+++ b/ConfigFile.py
@ -13,18 +13,42 @@
 import sys, re, datetime, os.path
 import database

-#
-# Read a line and strip out junk.
-#
-def ReadConfigLine (file):
-    line = file.readline ()
-    if not line:
-        return None
-    line = line.split('#')[0] # Get rid of any comments
-    line = line.strip () # and extra white space
-    if len (line) == 0: # we got rid of everything
-        return ReadConfigLine (file)
-    return line
+class ReadConfigLine:
+    """
+        ReadConfigLine provides an iterator that extracts the lines
+        of a config file, without comments.
+
+        Everything from a '#' to the end of the line is dropped, the
+        remainder is stripped of surrounding white space, and lines
+        that end up empty are skipped.
+
+        Typical use case:
+
+            fd = open(filename, 'r')
+            for line in ReadConfigLine(fd):
+                parse_line(line)
+            fd.close()
+    """
+
+    def __init__(self, fd):
+        # fd only needs a readline() method; this class never closes it.
+        self.fd = fd
+        # buffer and patch are not read anywhere in this class; they are
+        # kept so the set of instance attributes stays unchanged.
+        self.buffer = None
+        self.patch = []
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """
+            Return the next line that still has content once comments
+            and white space are removed.  Raises StopIteration when
+            readline() reports the end of the file.
+        """
+        while True:
+            raw = self.fd.readline()
+            if not raw:
+                raise StopIteration
+            line = raw.split('#')[0] # Get rid of any comments
+            line = line.strip()      # and extra white space
+            if line:                 # something is left: hand it out
+                return line
+
+    # Same method under the name used by the Python 3 iterator protocol.
+    __next__ = next
+
+

 #
 # Give up and die.
@ -38,19 +62,19 @@ def croak (message):
 #
 def ReadEmailAliases (name):
    try:
-        file = open (name, 'r')
+        fd = open (name, 'r')
    except IOError:
        croak ('Unable to open email alias file %s' % (name))
-    line = ReadConfigLine (file)
-    while line:
+
+    for line in ReadConfigLine (fd):
        m = re.match ('^("[^"]+"|\S+)\s+(.+)$', line)
        if not m or len (m.groups ()) != 2:
            croak ('Funky email alias line "%s"' % (line))
        if m and m.group (2).find ('@') <= 0:
            croak ('Non-addresses in email alias "%s"' % (line))
        database.AddEmailAlias (m.group (1).replace ('"', ''), m.group (2))
-        line = ReadConfigLine (file)
-    file.close ()
+ 
+    fd.close ()

 #
 # The Email/Employer map
@ -59,11 +83,11 @@ EMMpat = re.compile (r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$')

 def ReadEmailEmployers (name):
    try:
-        file = open (name, 'r')
+        fd = open (name, 'r')
    except IOError:
        croak ('Unable to open email/employer file %s' % (name))
-    line = ReadConfigLine (file)
-    while line:
+
+    for line in ReadConfigLine (fd):
        m = EMMpat.match (line)
        if not m:
            croak ('Funky email/employer line "%s"' % (line))
@ -71,8 +95,8 @@ def ReadEmailEmployers (name):
        company = m.group (2).strip ()
        enddate = ParseDate (m.group (4))
        database.AddEmailEmployerMapping (email, company, enddate)
-        line = ReadConfigLine (file)
-    file.close ()
+ 
+    fd.close ()

 def ParseDate (cdate):
    if not cdate:
@ -83,22 +107,22 @@ def ParseDate (cdate):

 def ReadGroupMap (fname, employer):
    try:
-        file = open (fname, 'r')
+        fd = open (fname, 'r')
    except IOError:
        croak ('Unable to open group map file %s' % (fname))
-    line = ReadConfigLine (file)
-    while line:
+
+    for line in ReadConfigLine (fd):
        database.AddEmailEmployerMapping (line, employer)
-        line = ReadConfigLine (file)
-    file.close ()
+
+    fd.close ()

 #
 # Read in a virtual employer description.
 #
-def ReadVirtual (file, name):
+def ReadVirtual (fd, name):
    ve = database.VirtualEmployer (name)
-    line = ReadConfigLine (file)
-    while line:
+
+    for line in ReadConfigLine (fd):
        sl = line.split (None, 1)
        first = sl[0]
        if first == 'end':
@ -116,23 +140,57 @@ def ReadVirtual (file, name):
        if not (0 < percent <= 100):
            croak ('Bad split value "%s" for virtual empl %s' % (first, name))
        ve.addsplit (' '.join (sl[1:]), percent/100.0)
-        line = ReadConfigLine (file)
    #
    # We should never get here
    #
    croak ('Missing "end" line for virtual employer %s' % (name))

+#
+# Read file type patterns for more fine-grained reports
+#
+def ReadFileType (filename):
+    """
+        Parse a file type mapping file.
+
+        Recognized lines are 'order <type1>,<type2>,...' and
+        'filetype <type> <regex>'.  Returns a tuple (patterns, order):
+        patterns maps each type name to a list of compiled,
+        case-insensitive regular expressions, and order is the list of
+        type names in matching precedence.  Any other line, or an
+        unreadable file, is reported through croak().
+    """
+    try:
+        fd = open (filename, 'r')
+    except IOError:
+        croak ('Unable to open file type mapping file %s' % (filename))
+    patterns = {}
+    order = []
+    regex_order = re.compile (r'^order\s+(.*)$')
+    regex_file_type = re.compile (r'^filetype\s+(\S+)\s+(.+)$')
+
+    for line in ReadConfigLine (fd):
+        o = regex_order.match (line)
+        if o:
+            # Consider only the first definition in the config file.
+            # Note that an 'order' line is also ignored once filetype
+            # lines have already filled in the order list.
+            elements = o.group(1).replace (' ', '')
+            order = order or elements.split(',')
+            continue
+
+        m = regex_file_type.match (line)
+        if not m:
+            # croak lives in this module; 'ConfigFile' here is a function
+            croak ('Funky file type line "%s"' % (line))
+        file_type = m.group (1)
+        if file_type not in order:
+            print '%s not found, appended to the last order' % file_type
+            order.append (file_type)
+
+        patterns.setdefault (file_type, []).append (
+            re.compile (m.group (2), re.IGNORECASE))
+
+    fd.close ()
+    return patterns, order
+
+
 #
 # Read an overall config file.
 #

 def ConfigFile (name, confdir):
    try:
-        file = open (name, 'r')
+        fd = open (name, 'r')
    except IOError:
        croak ('Unable to open config file %s' % (name))
-    line = ReadConfigLine (file)
-    while line:
+
+    for line in ReadConfigLine (fd):
        sline = line.split (None, 2)
        if len (sline) < 2:
            croak ('Funky config line: "%s"' % (line))
@ -146,7 +204,20 @@ def ConfigFile (name, confdir):
            ReadGroupMap (os.path.join (confdir, sline[1]), sline[2])
        elif sline[0] == 'VirtualEmployer':
            ReadVirtual (file, ' '.join (sline[1:]))
+        elif sline[0] == 'FileTypeMap':
+            patterns, order = ReadFileType (os.path.join (confdir, sline[1]))
+            database.FileTypes = database.FileType (patterns, order)
        else:
            croak ('Unrecognized config line: "%s"' % (line))
-        line = ReadConfigLine (file)
+
+
+if __name__ == '__main__':
+    # Test the iterator for reading configuration files: print every
+    # meaningful line of the file named on the command line.
+    try:
+        fd = open(sys.argv[1])
+    except (IndexError, IOError):
+        # Missing argument or unreadable file
+        croak('Usage: %s <config-file>' % sys.argv[0])
+
+    for line in ReadConfigLine(fd):
+        print line
+    fd.close()
        
--- a/47
+++ b/47
@ -20,6 +20,10 @@ Run it like this:

   git log -p -M [details] | gitdm [options]

+Alternatively, you can run with:
+
+   git log --numstat -M [details] | gitdm -n [options]
+
 The [details] tell git which changesets are of interest; the [options] can
 be:

@ -32,26 +36,35 @@ be:
 	   	By default, "./gitdm.config" is used.

 	-d	Omit the developer reports, giving employer information
-         	only. 
+         	only.

-	-D	Rather than create the usual statistics, create a 
-        file (datelc) providing lines changed per day, where the first column
-        displays the changes happened only on that day and the second sums
-        the day it happnened with the previous ones. This option is suitable
-        for feeding to a tool like gnuplot.
+	-D	Rather than create the usual statistics, create a file (datelc.csv)
+	    providing lines changed per day, where the first column displays
+	    the changes happened only on that day and the second sums the day it
+	    happened with the previous ones. This option is suitable for
+	    feeding to a tool like gnuplot.

 	-h file	Generate HTML output to the given file

 	-l num	Only list the top <num> entries in each report.

+    -n  Use --numstat instead of generated patches to get the statistics.
+
 	-o file	Write text output to the given file (default is stdout).

+    -p prefix Dump out the database categorized by changeset and by file type.
+	    It requires -n, otherwise it is not possible to get separated results.
+
 	-r pat	Only generate statistics for changes to files whose 
 	   	name matches the given regular expression.

 	-s	Ignore Signed-off-by lines which match the author of 
 		each patch.

+	-t	Generate a report by type of contribution (code, documentation, etc.).
+	    It requires -n, otherwise this option is ignored silently.
+
+
 	-u 	Group all unknown developers under the "(Unknown)"
 	        employer. 

@ -68,6 +81,10 @@ looks like:
    git log -p -M v2.6.19..v2.6.20 | \
 	gitdm -u -s -a -o results -h results.html

+or:
+
+    git log --numstat -M v2.6.19..v2.6.20 | \
+	gitdm -u -s -a -n -o results -h results.html

 CONFIGURATION FILE

@ -134,6 +151,24 @@ end
 	for example, no check to ensure that the percentages add up to
    	something rational.

+FileTypeMap file
+
+	Map file names/extensions onto file types.  These files contain lines
+	like:
+
+		order <type1>,<type2>,...,<typeN>
+
+		filetype <type> <regex>
+		...
+
+	This construct allows fine-grained reports by type of contribution
+	(build, code, image, multimedia, documentation, etc.)
+
+	Order is important because a file name can match the patterns of
+	more than one type.  For instance, ltmain.sh fits better as 'build'
+	than as 'code' (matched by its full name rather than by '\.sh$').
+	The first element in order has precedence over the next ones.
+

 OTHER TOOLS

--- a/csv.py
+++ b/csv.py
@ -1,40 +0,0 @@
-#
-# aggregate per-month statistics for people
-#
-import sys, datetime
-
-class CSVStat:
-    def __init__ (self, name, employer, date):
-        self.name = name
-        self.employer = employer
-        self.added = self.removed = 0
-        self.date = date
-    def accumulate (self, p):
-        self.added = self.added + p.added
-        self.removed = self.removed + p.removed
-
-PeriodCommitHash = { }
-
-def AccumulatePatch (p, Aggregate):
-    date = "%.2d-%.2d-01"%(p.date.year, p.date.month)
-    if (Aggregate == 'week'):
-        date = "%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1])
-    authdatekey = "%s-%s"%(p.author.name, date)
-    if authdatekey not in PeriodCommitHash:
-        empl = p.author.emailemployer (p.email, p.date)
-        stat = CSVStat (p.author.name, empl, date)
-        PeriodCommitHash[authdatekey] = stat
-    else:
-        stat = PeriodCommitHash[authdatekey]
-    stat.accumulate (p)
-
-def OutputCSV (file):
-    if file is None:
-        return
-    file.write ("Name\tAffliation\tDate\tAdded\tRemoved\n")
-    for date, stat in PeriodCommitHash.items():
-        # sanitise names " is common and \" sometimes too
-        empl_name = stat.employer.name.replace ("\"", ".").replace ("\\", ".")
-        author_name = stat.name.replace ("\"", ".").replace ("\\", ".")
-        file.write ("\"%s\"\t\"%s\"\t%s\t%d\t%d\n"%(author_name, empl_name, stat.date, \
-                                                    stat.added, stat.removed))
--- a/csvdump.py
+++ b/csvdump.py
@ -0,0 +1,88 @@
+#
+# aggregate per-month statistics for people
+#
+import sys, datetime
+import csv
+
+class CSVStat:
+    # Lines added/removed by one author in one period, plus the
+    # identifying fields written to the CSV output.
+    def __init__ (self, name, email, employer, date):
+        self.name, self.email = name, email
+        self.employer, self.date = employer, date
+        self.added = 0
+        self.removed = 0
+    def accumulate (self, p):
+        # Fold the counters of patch p into the running totals
+        self.added += p.added
+        self.removed += p.removed
+
+PeriodCommitHash = { }
+
+def AccumulatePatch (p, Aggregate):
+    # Add patch p to the per-author, per-period totals kept in
+    # PeriodCommitHash.  Periods are ISO weeks when Aggregate is 'week',
+    # calendar months otherwise.
+    if Aggregate == 'week':
+        iso = p.date.isocalendar()
+        period = "%.2d-%.2d"%(iso[0], iso[1])
+    else:
+        period = "%.2d-%.2d-01"%(p.date.year, p.date.month)
+    key = "%s-%s"%(p.author.name, period)
+    entry = PeriodCommitHash.get (key)
+    if entry is None:
+        # First patch by this author in this period: create the entry
+        employer = p.author.emailemployer (p.email, p.date)
+        entry = CSVStat (p.author.name, p.email, employer, period)
+        PeriodCommitHash[key] = entry
+    entry.accumulate (p)
+
+ChangeSets = []
+FileTypes = []
+
+def store_patch(patch):
+    """
+        Record a patch for the CSV dumps written by save_csv.
+
+        Appends one row to ChangeSets (commit, date, email, domain,
+        author, employer, added, removed) and one row per file type
+        of the patch to FileTypes.  Merges are not recorded.
+    """
+    if patch.merge:
+        return
+
+    employer = patch.author.emailemployer(patch.email, patch.date)
+    employer = employer.name.replace('"', '.').replace ('\\', '.')
+    # Sanitise on top of the previous result; starting again from
+    # patch.author.name would throw the first two replacements away.
+    author = patch.author.name.replace ('"', '.').replace ('\\', '.')
+    author = author.replace ("'", '.')
+    try:
+        domain = patch.email.split('@')[1]
+    except (IndexError, AttributeError):
+        # No '@' in the address (or no string at all): keep it as is
+        domain = patch.email
+    ChangeSets.append([patch.commit, str(patch.date),
+                       patch.email, domain, author, employer,
+                       patch.added, patch.removed])
+    for (filetype, (added, removed)) in patch.filetypes.items():
+        FileTypes.append([patch.commit, filetype, added, removed])
+
+
+def _write_csv (filename, header, rows):
+    # Write header and rows to filename, closing the file even on error.
+    fd = open (filename, 'w')
+    try:
+        writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC)
+        writer.writerow (header)
+        writer.writerows (rows)
+    finally:
+        fd.close ()
+
+def save_csv (prefix='data'):
+    """
+        Dump the rows collected by store_patch into
+        '<prefix>-changesets.csv' and '<prefix>-filetypes.csv'.
+        A file is only written when there are rows for it.
+    """
+    # Dump the ChangeSets.  The header follows the order in which
+    # store_patch fills the rows: email comes before domain.
+    if len(ChangeSets) > 0:
+        _write_csv ('%s-changesets.csv' % prefix,
+                    ['Commit', 'Date', 'Email',
+                     'Domain', 'Name', 'Affliation',
+                     'Added', 'Removed'],
+                    ChangeSets)
+
+    # Dump the file types
+    if len(FileTypes) > 0:
+        _write_csv ('%s-filetypes.csv' % prefix,
+                    ['Commit', 'Type', 'Added', 'Removed'],
+                    FileTypes)
+
+
+
+def OutputCSV (file):
+    # Write one CSV row per entry of PeriodCommitHash to the open file
+    # object given; does nothing when file is None.
+    if file is None:
+        return
+
+    def sanitise (text):
+        # sanitise names " is common and \" sometimes too
+        return text.replace ('"', '.').replace ('\\', '.')
+
+    out = csv.writer (file, quoting=csv.QUOTE_NONNUMERIC)
+    out.writerow (['Name', 'Email', 'Affliation', 'Date',
+                   'Added', 'Removed'])
+    for entry in PeriodCommitHash.values():
+        employer = sanitise (entry.employer.name)
+        author = sanitise (entry.name)
+        out.writerow ([author, entry.email, employer, entry.date,
+                       entry.added, entry.removed])
+
+# Public entry points of this module
+__all__ = [  'AccumulatePatch', 'OutputCSV', 'store_patch', 'save_csv' ]
--- a/database.py
+++ b/database.py
@ -188,6 +188,25 @@ class VirtualEmployer (Employer):
            # Should check that they add up too, but I'm lazy
        Employers[self.name] = self

+class FileType:
+    """
+        Classify file names using per-type lists of compiled regular
+        expressions, tried in a fixed order of types.
+    """
+    def __init__ (self, patterns=None, order=None):
+        # None stands for "empty"; a literal {} or [] default would be
+        # one object shared by every instance created without arguments.
+        self.patterns = {} if patterns is None else patterns
+        self.order = [] if order is None else order
+
+    def guess_file_type (self, filename, patterns=None, order=None):
+        """
+            Return the first type in order that has a pattern matching
+            filename (using search), or 'unknown' when none does.
+            patterns and order fall back to the instance values when
+            they are omitted or empty.
+        """
+        patterns = patterns or self.patterns
+        order = order or self.order
+
+        for file_type in order:
+            # Types named in order but without patterns are skipped
+            for patt in patterns.get (file_type, ()):
+                if patt.search (filename):
+                    return file_type
+
+        return 'unknown'
+
+FileTypes = None
+
 #
 # Mix all the virtual employers into their real destinations.
 #
--- a/195
+++ b/195
@ -1,4 +1,5 @@
 #!/usr/bin/pypy
+#-*- coding:utf-8 -*-
 #

 #
@ -6,15 +7,17 @@
 #
 # Copyright 2007-11 Eklektix, Inc.
 # Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
+# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
 #
 # This file may be distributed under the terms of the GNU General
 # Public License, version 2.


-import database, csv, ConfigFile, reports
+import database, csvdump, ConfigFile, reports
 import getopt, datetime
 import os, re, sys, rfc822, string
-from patterns import *
+import logparser
+from patterns import patterns

 Today = datetime.date.today()

@ -32,11 +35,14 @@ DateStats = 0
 AuthorSOBs = 1
 FileFilter = None
 CSVFile = None
+CSVPrefix = None
 AkpmOverLt = 0
 DumpDB = 0
 CFName = 'gitdm.config'
 DirName = ''
 Aggregate = 'month'
+Numstat = 0
+ReportByFileType = 0

 #
 # Options:
@ -48,7 +54,9 @@ Aggregate = 'month'
 # -D		Output date statistics
 # -h hfile	HTML output to hfile
 # -l count	Maximum length for output lists
+# -n        Use numstats instead of generated patch from git log
 # -o file	File for text output
+# -p prefix Prefix for CSV output
 # -r pattern	Restrict to files matching pattern
 # -s		Ignore author SOB lines
 # -u		Map unknown employers to '(Unknown)'
@ -59,9 +67,10 @@ Aggregate = 'month'
 def ParseOpts ():
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
-    global CFName, CSVFile, DirName, Aggregate
+    global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
+    global ReportByFileType

-    opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z')
+    opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:no:p:r:stuwx:z')
    for opt in opts:
        if opt[0] == '-a':
            AkpmOverLt = 1
@ -77,13 +86,19 @@ def ParseOpts ():
            reports.SetHTMLOutput (open (opt[1], 'w'))
        elif opt[0] == '-l':
            reports.SetMaxList (int (opt[1]))
+        elif opt[0] == '-n':
+            Numstat = 1
        elif opt[0] == '-o':
            reports.SetOutput (open (opt[1], 'w'))
+        elif opt[0] == '-p':
+            CSVPrefix = opt[1]
        elif opt[0] == '-r':
            print 'Filter on "%s"' % (opt[1])
            FileFilter = re.compile (opt[1])
        elif opt[0] == '-s':
            AuthorSOBs = 0
+        elif opt[0] == '-t':
+            ReportByFileType = 1
        elif opt[0] == '-u':
            MapUnknown = 1
        elif opt[0] == '-x':
@ -139,6 +154,8 @@ def PrintDateStats():
 # Let's slowly try to move some smarts into this class.
 #
 class patch:
+    (ADDED, REMOVED) = range (2)
+
    def __init__ (self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
@ -148,6 +165,7 @@ class patch:
        self.reviews = [ ]
        self.testers = [ ]
        self.reports = [ ]
+        self.filetypes = {}

    def addreviewer (self, reviewer):
        self.reviews.append (reviewer)
@ -157,36 +175,57 @@ class patch:

    def addreporter (self, reporter):
        self.reports.append (reporter)
+
+    def addfiletype (self, filetype, added, removed):
+        # Add the counters to the [added, removed] pair kept for this
+        # file type, creating the pair on first use.
+        counters = self.filetypes.get (filetype)
+        if counters is None:
+            self.filetypes[filetype] = [added, removed]
+            return
+        counters[self.ADDED] += added
+        counters[self.REMOVED] += removed
+
+def parse_numstat(line, file_filter):
+    """
+        Receive a line of text, determine if it fits a numstat line and
+        parse the added and removed lines as well as the file type.
+
+        Returns (filename, filetype, added, removed), or four Nones
+        when the line is not a numstat line or file_filter rejects
+        the file name.  The file type is 'unknown' when no file type
+        map has been loaded into database.FileTypes.
+    """
+    m = patterns['numstat'].match (line)
+    if not m:
+        return None, None, None, None
+
+    filename = m.group (3)
+    # If we have a file filter, check for file lines.
+    if file_filter and not file_filter.search (filename):
+        return None, None, None, None
+
+    try:
+        added = int (m.group (1))
+        removed = int (m.group (2))
+    except ValueError:
+        # A binary file (image, etc.) is marked with '-'
+        added = removed = 0
+
+    # Rewrite 'prefix{old => new}suffix' as 'prefix' + 'new' + 'suffix'
+    m = patterns['rename'].match (filename)
+    if m:
+        filename = '%s%s%s' % (m.group (1), m.group (3), m.group (4))
+
+    if database.FileTypes is None:
+        # No FileTypeMap configured: same answer guess_file_type gives
+        # when nothing matches, instead of an AttributeError on None.
+        filetype = 'unknown'
+    else:
+        filetype = database.FileTypes.guess_file_type (os.path.basename(filename))
+    return filename, filetype, added, removed
+
+
 #
 # The core hack for grabbing the information about a changeset.
 #
-def grabpatch():
-    global NextLine
-    
-    while (1):
-        m = Pcommit.match (NextLine)
-        if m:
-            break;
-        NextLine = sys.stdin.readline ()
-        if not NextLine:
-            return
+def grabpatch(logpatch):
+    m = patterns['commit'].match (logpatch[0])
+    if not m:
+        return None

    p = patch(m.group (1))
-    NextLine = sys.stdin.readline ()
    ignore = (FileFilter is not None)
-    while NextLine:
-        Line = NextLine
-        #
-        # If this line starts a new commit, drop out.
-        #
-        m = Pcommit.match (Line)
-        if m:
-            break
-        NextLine = sys.stdin.readline ()
+    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
-        m = Pauthor.match (Line)
+        m = patterns['author'].match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker(m.group (1), p.email)
@ -194,7 +233,7 @@ def grabpatch():
        #
        # Could be a signed-off-by:
        #
-        m = Psob.match (Line)
+        m = patterns['signed-off-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker(m.group (1), email)
@ -204,24 +243,26 @@ def grabpatch():
        #
        # Various other tags of interest.
        #
-        m = Preview.match (Line)  # Reviewed-by:
+        m = patterns['reviewed-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker(m.group (1), email))
            continue
-        m = Ptest.match (Line)    # Tested-by:
+        m = patterns['tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            p.author.testcredit (patch)
            continue
-        m = Prep.match (Line)     # Reported-by:
+        # Reported-by:
+        m = patterns['reported-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (patch)
            continue
-        m = Preptest.match (Line)  # Reported-and-tested-by:
+        # Reported-and-tested-by:
+        m = patterns['reported-and-tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
@ -233,14 +274,14 @@ def grabpatch():
        #
        # If this one is a merge, make note of the fact.
        #
-        m = Pmerge.match (Line)
+        m = patterns['merge'].match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
-        m = Pdate.match (Line)
+        m = patterns['date'].match (Line)
        if m:
            dt = rfc822.parsedate(m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
@ -248,20 +289,29 @@ def grabpatch():
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
-        #
-        # If we have a file filter, check for file lines.
-        #
-        if FileFilter:
-            ignore = ApplyFileFilter (Line, ignore)
-        #
-        # OK, maybe it's part of the diff itself.
-        #
-        if not ignore:
-            if Padd.match (Line):
-                p.added += 1
-                continue
-            if Prem.match (Line):
-                p.removed += 1
+        if not Numstat:
+            #
+            # If we have a file filter, check for file lines.
+            #
+            if FileFilter:
+                ignore = ApplyFileFilter (Line, ignore)
+            #
+            # OK, maybe it's part of the diff itself.
+            #
+            if not ignore:
+                if patterns['add'].match (Line):
+                    p.added += 1
+                    continue
+                if patterns['rem'].match (Line):
+                    p.removed += 1
+        else:
+            # Get the statistics (lines added/removes) using numstats
+            # and without requiring a diff (--numstat instead -p)
+			(filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
+			if filename:
+			    p.added += added
+			    p.removed += removed
+			    p.addfiletype (filetype, added, removed)

    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)
@ -279,7 +329,7 @@ def ApplyFileFilter (line, ignore):
    # If this is the first file line (--- a/), set ignore one way
    # or the other.
    #
-    m = Pfilea.match (line)
+    m = patterns['filea'].match (line)
    if m:
        file = m.group (1)
        if FileFilter.search (file):
@ -288,13 +338,29 @@ def ApplyFileFilter (line, ignore):
    #
    # For the second line, we can turn ignore off, but not on
    #
-    m = Pfileb.match (line)
+    m = patterns['fileb'].match (line)
    if m:
        file = m.group (1)
        if FileFilter.search (file):
            return 0
    return ignore

+def is_svntag(logpatch):
+    """
+        Tell whether any line of the changeset carries an svn tag
+        marker, writing a warning to stderr for the first one found.
+
+        This is a workaround for a bug on the migration to Git
+        from Subversion found in GNOME.  It may happen in other
+        repositories as well.
+    """
+
+    svn_tag = patterns['svn-tag']
+    for text in logpatch:
+        found = svn_tag.match(text.strip())
+        if not found:
+            continue
+        warning = '(W) detected a commit on a svn tag: %s\n' % (found.group (0),)
+        sys.stderr.write (warning)
+        return True
+
+    return False
+
 #
 # If this patch is signed off by both Andrew Morton and Linus Torvalds,
 # remove the (redundant) Linus signoff.
@ -324,7 +390,6 @@ if AkpmOverLt == 1:
    Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))

-NextLine = sys.stdin.readline ()
 TotalChanged = TotalAdded = TotalRemoved = 0

 #
@ -332,12 +397,23 @@ TotalChanged = TotalAdded = TotalRemoved = 0
 #
 print >> sys.stderr, 'Grabbing changesets...\r',

+patches = logparser.LogPatchSplitter(sys.stdin)
 printcount = CSCount = 0
-while (1):
+
+for logpatch in patches:
    if (printcount % 50) == 0:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1
-    p = grabpatch()
+
+    # We want to ignore commits on svn tags, since in Subversion a
+    # tag is a copy of the whole repository, which leads to wrong
+    # results.  Some migrations from Subversion to Git do not catch
+    # all of these tag copies and import each one as just a new,
+    # big changeset.
+    if is_svntag(logpatch):
+        continue
+
+    p = grabpatch(logpatch)
    if not p:
        break
 #    if p.added > 100000 or p.removed > 100000:
@ -373,8 +449,9 @@ while (1):
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
-    CSCount += 1
-    csv.AccumulatePatch (p, Aggregate)
+        CSCount += 1
+    csvdump.AccumulatePatch (p, Aggregate)
+    csvdump.store_patch (p)
 print >> sys.stderr, 'Grabbing changesets...done       '

 if DumpDB:
@ -403,10 +480,16 @@ if TotalChanged == 0:
 if DateStats:
    PrintDateStats ()

-csv.OutputCSV (CSVFile)
-if CSVFile is not None:
-        CSVFile.close ()
+if CSVPrefix:
+    csvdump.save_csv (CSVPrefix)
+
+if CSVFile:
+    csvdump.OutputCSV (CSVFile)
+    CSVFile.close ()

 if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
 reports.EmplReports (elist, TotalChanged, CSCount)
+
+if ReportByFileType and Numstat:
+    reports.ReportByFileType (hlist)
--- a/gitdm.config
+++ b/gitdm.config
@ -20,3 +20,8 @@ EmailMap sample-config/domain-map
 #
 # GroupMap sample-config/illuminati  The Illuminati
 #
+#
+# Use FileTypeMap to map file names to file types using
+# regular expressions.
+#
+FileTypeMap sample-config/filetypes.txt
--- a/logparser.py
+++ b/logparser.py
@ -0,0 +1,90 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+#
+# Copyright © 2009 Germán Póo-Caamaño <gpoo@gnome.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+
+import sys
+from patterns import patterns
+
+class LogPatchSplitter:
+    """
+        LogPatchSplitter provides an iterator to extract every
+        changeset from a git log output.
+
+        Each item is the list of lines of one changeset, starting
+        with its 'commit' line.
+
+        Typical use case:
+
+            patches = LogPatchSplitter(sys.stdin)
+
+            for patch in patches:
+                parse_patch(patch)
+    """
+
+    def __init__(self, fd):
+        # fd only needs a readline() method
+        self.fd = fd
+        # Commit line read ahead while finishing the previous changeset
+        self.buffer = None
+        # Not read anywhere in this class; kept so the set of instance
+        # attributes stays unchanged.
+        self.patch = []
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        patch = self.__grab_patch__()
+        if not patch:
+            raise StopIteration
+        return patch
+
+    # Same method under the name used by the Python 3 iterator protocol.
+    __next__ = next
+
+    def __grab_patch__(self):
+        """
+            Extract a patch from the file descriptor and the
+            patch is returned as a list of lines.  Returns None
+            when no further commit line can be found.
+        """
+
+        # Take the read-ahead commit line, if any, and clear it right
+        # away.  If it were left in place, a commit line that is the
+        # last line of the input would be handed out again on every call.
+        line, self.buffer = self.buffer, None
+        if not line:
+            line = self.fd.readline()
+
+        # Skip whatever precedes the next commit line.
+        while line and not patterns['commit'].match(line):
+            line = self.fd.readline()
+
+        if not line:
+            return None
+
+        patch = [line]
+        line = self.fd.readline()
+        while line:
+            # If this line starts a new commit, drop out.
+            if patterns['commit'].match(line):
+                self.buffer = line
+                break
+
+            patch.append(line)
+            line = self.fd.readline()
+
+        return patch
+
+
+if __name__ == '__main__':
+    # Manual check: split the git log read from stdin and print every
+    # changeset under a separator line.
+    patches = LogPatchSplitter(sys.stdin)
+
+    for patch in patches:
+        print '---------- NEW PATCH ----------'
+        for line in patch:
+            # Lines come from readline() and keep their own newline;
+            # the trailing comma stops print from adding another one.
+            print line,
--- a/patterns.py
+++ b/patterns.py
@ -1,10 +1,12 @@
 #
+# -*- coding:utf-8 -*-
 # Pull together regular expressions used in multiple places.
 #
 # This code is part of the LWN git data miner.
 #
 # Copyright 2007-11 Eklektix, Inc.
 # Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
+# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
 #
 # This file may be distributed under the terms of the GNU General
 # Public License, version 2.
@ -16,24 +18,34 @@ import re
 # expressions." Now they have two problems.
 #    -- Jamie Zawinski
 #
-Pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name
-Pcommit = re.compile (r'^commit ([0-9a-f ]+)$')
-Pauthor = re.compile (r'^Author:' + Pemail + '$')
-Psob = re.compile (r'^\s+Signed-off-by:' + Pemail + '.*$')
-Pmerge = re.compile (r'^Merge:.*$')
-Padd = re.compile (r'^\+[^+].*$')
-Prem = re.compile (r'^-[^-].*$')
-Pdate = re.compile (r'^(Commit)?Date:\s+(.*)$')
-Pfilea = re.compile (r'^---\s+(.*)$')
-Pfileb = re.compile (r'^\+\+\+\s+(.*)$')
-Preview = re.compile (r'^\s+Reviewed-by:' + Pemail + '.*$')
-Ptest = re.compile (r'^\s+tested-by:' + Pemail + '.*$', re.I)
-Prep = re.compile (r'^\s+Reported-by:' + Pemail + '.*$')
-Preptest = re.compile (r'^\s+reported-and-tested-by:' + Pemail + '.*$', re.I)
-#
-# Merges are described with a variety of lines.
-#
-PExtMerge = re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$')
-PIntMerge = re.compile(r'^ +(Merge|Pull) .* into .*$')
-# PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$")
-PIntMerge2 = re.compile(r"^ +Merge .*$")
+_pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name
+
+# Compiled regular expressions, looked up by name.
+patterns = {
+    'commit': re.compile (r'^commit ([0-9a-f ]+)$'),
+    'author': re.compile (r'^Author:' + _pemail + '$'),
+    'signed-off-by': re.compile (r'^\s+Signed-off-by:' + _pemail + '.*$'),
+    'merge': re.compile (r'^Merge:.*$'),
+    'add': re.compile (r'^\+[^+].*$'),
+    'rem': re.compile (r'^-[^-].*$'),
+    'date': re.compile (r'^(Commit)?Date:\s+(.*)$'),
+    # filea, fileb are used only in 'patch mode' (-p)
+    'filea': re.compile (r'^---\s+(.*)$'),
+    'fileb': re.compile (r'^\+\+\+\s+(.*)$'),
+    'reviewed-by': re.compile (r'^\s+Reviewed-by:' + _pemail+ '.*$'),
+    'tested-by': re.compile (r'^\s+tested-by:' + _pemail + '.*$', re.I),
+    'reported-by': re.compile (r'^\s+Reported-by:' + _pemail + '.*$'),
+    'reported-and-tested-by': re.compile (r'^\s+reported-and-tested-by:' + _pemail + '.*$', re.I),
+    #
+    # Merges are described with a variety of lines.
+    #
+    'ExtMerge': re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$'),
+    'IntMerge': re.compile(r'^ +(Merge|Pull) .* into .*$'),
+    # PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$"),
+    'IntMerge2': re.compile(r"^ +Merge .*$"),
+    # Another way to get the statistics (per file).
+    # It implies --numstat
+    # (raw strings, like the entries above, so the backslashes reach
+    # the regex engine untouched)
+    'numstat': re.compile(r'^(\d+|-)\s+(\d+|-)\s+(.*)$'),
+    'rename' : re.compile(r'(.*)\{(.*) => (.*)\}(.*)'),
+    # Detect errors on svn conversions
+    'svn-tag': re.compile(r"^svn path=/tags/(.*)/?; revision=([0-9]+)$"),
+}
--- a/reports.py
+++ b/reports.py
@ -340,4 +340,46 @@ def EmplReports (elist, totalchanged, cscount):
    ReportByELChanged (elist, totalchanged)
    ReportByESOBs (elist)
    ReportByEHackers (elist)
-    
+
+def ReportByFileType (hacker_list):
+    """
+        Print line counts (added, removed) grouped by file type.
+
+        Three reports are started with BeginReport and printed to
+        stdout: a summary per developer, the totals per file type
+        broken down by developer, and the totals per file type alone.
+
+        hacker_list is iterated once; each entry must provide 'name'
+        and 'patches', and each patch must provide 'filetypes' (a dict
+        mapping file type to an (added, removed) pair) plus the ADDED
+        and REMOVED indexes into such a pair.
+    """
+    # file type -> [added, removed, [[hacker name, added, removed], ...]]
+    total = {}
+
+    BeginReport ('Developer contributions by type')
+    for h in hacker_list:
+        # file type -> [added, removed], for this hacker only
+        by_hacker = {}
+        for patch in h.patches:
+            for (filetype, (added, removed)) in patch.filetypes.iteritems():
+                # Get a summary by hacker
+                if filetype in by_hacker:
+                    by_hacker[filetype][patch.ADDED] += added
+                    by_hacker[filetype][patch.REMOVED] += removed
+                else:
+                    by_hacker[filetype] = [added, removed]
+
+                # Update the totals
+                if filetype in total:
+                    total[filetype][patch.ADDED] += added
+                    total[filetype][patch.REMOVED] += removed
+                else:
+                    total[filetype] = [added, removed, []]
+
+        # Print a summary by hacker
+        print h.name
+        for filetype, counters in by_hacker.iteritems():
+            print '\t', filetype, counters
+            # 'patch' is still bound to the last patch seen above; that
+            # is safe because by_hacker is only non-empty when the loop
+            # over h.patches ran at least once.
+            total[filetype][2].append ([h.name,
+                                        counters[patch.ADDED],
+                                        counters[patch.REMOVED]])
+
+    # Print the global summary
+    BeginReport ('Contributions by type and developers')
+    for filetype, (added, removed, hackers) in total.iteritems():
+        print filetype, added, removed
+        # 'name' rather than 'h': do not shadow the hacker loop variable
+        for name, h_added, h_removed in hackers:
+            print '\t%s: [%d, %d]' % (name, h_added, h_removed)
+
+    # Print the very global summary
+    BeginReport ('General contributions by type')
+    for filetype, (added, removed, hackers) in total.iteritems():
+        print filetype, added, removed
--- a/sample-config/filetypes.txt
+++ b/sample-config/filetypes.txt
@ -0,0 +1,362 @@
+# -*- coding:utf-8 -*-
+# Copyright (C)  2006 Libresoft
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# Authors : Gregorio Robles <grex@gsyc.escet.urjc.es>
+# Authors : Germán Póo-Caamaño <gpoo@gnome.org>
+#
+# This file contains the parameters that associate file names with filetypes
+# (documentation, development, multimedia, images...)
+#
+# format:
+# filetype <type> <regex> [<comment>]
+#
+# Order:
+#   The list should keep an order, so filetypes can be counted properly,
+#   i.e. we want ltmain.sh -> 'build' instead of 'code'.
+#
+#   If there is a filetype which is not in the order but has values, it
+#   will be added at the end.
+#
+order image,translation,ui,multimedia,package,build,code,documentation,devel-doc
+#
+#
+# Code files (headers and the like included)
+# (most common languages first)
+#
+filetype code \.c$	# C
+filetype code \.pc$	# C
+filetype code \.ec$	# C
+filetype code \.ecp$	# C
+filetype code \.C$	# C++
+filetype code \.cpp$	# C++
+filetype code \.c\+\+$	# C++
+filetype code \.cxx$	# C++
+filetype code \.cc$	# C++
+filetype code \.pcc$	# C++
+filetype code \.cpy$	# C++
+filetype code \.h$	# C or C++ header
+filetype code \.hh$	# C++ header
+filetype code \.hpp$	# C++ header
+filetype code \.hxx$	# C++ header
+filetype code \.sh$	# Shell
+filetype code \.pl$	# Perl
+filetype code \.pm$	# Perl
+filetype code \.pod$	# Perl
+filetype code \.perl$	# Perl
+filetype code \.cgi$	# CGI
+filetype code \.php$	# PHP
+filetype code \.php3$	# PHP
+filetype code \.php4$	# PHP
+filetype code \.inc$	# PHP
+filetype code \.py$	# Python
+filetype code \.java$	# Java
+filetype code \.class$	# Java Class (or at least a class in some OOPL)
+filetype code \.ada$	# ADA
+filetype code \.ads$	# ADA
+filetype code \.adb$	# ADA
+filetype code \.pad$	# ADA
+filetype code \.s$	# Assembly
+filetype code \.S$	# Assembly
+filetype code \.asm$	# Assembly
+filetype code \.awk$	# awk
+filetype code \.cs$	# C#
+filetype code \.csh$	# CShell (including tcsh)
+filetype code \.cob$	# COBOL
+filetype code \.cbl$	# COBOL
+filetype code \.COB$	# COBOL
+filetype code \.CBL$	# COBOL
+filetype code \.exp$	# Expect
+filetype code \.l$	# (F)lex
+filetype code \.ll$	# (F)lex
+filetype code \.lex$	# (F)lex
+filetype code \.f$	# Fortran
+filetype code \.f77$	# Fortran
+filetype code \.F$	# Fortran
+filetype code \.hs$	# Haskell
+filetype code \.lhs$	# Not preprocessed Haskell
+filetype code \.el$	# LISP (including Scheme)
+filetype code \.scm$	# LISP (including Scheme)
+filetype code \.lsp$	# LISP (including Scheme)
+filetype code \.jl$	# LISP (including Scheme)
+filetype code \.ml$	# ML
+filetype code \.ml3$	# ML
+filetype code \.m3$	# Modula3
+filetype code \.i3$	# Modula3
+filetype code \.m$	# Objective-C
+filetype code \.p$	# Pascal
+filetype code \.pas$	# Pascal
+filetype code \.rb$	# Ruby
+filetype code \.sed$	# sed
+filetype code \.tcl$	# TCL
+filetype code \.tk$	# TCL
+filetype code \.itk$	# TCL
+filetype code \.y$	# Yacc
+filetype code \.yy$	# Yacc
+filetype code \.idl$	# CORBA IDL
+filetype code \.gnorba$	# GNOME CORBA IDL
+filetype code \.oafinfo$	# GNOME OAF
+filetype code \.mcopclass$	# MCOP IDL compiler generated class
+filetype code \.autoforms$	# Autoform
+filetype code \.atf$	# Autoform
+filetype code \.gnuplot$
+filetype code \.xs$	# Shared library? Seen a lot of them in gnome-perl
+filetype code \.js$	# JavaScript (and who knows, maybe more)
+filetype code \.patch$
+filetype code \.diff$	# Sometimes patches appear this way
+filetype code \.ids$	# Not really sure what this means
+filetype code \.upd$	# ¿¿¿??? (from Kcontrol)
+filetype code \.ad$ 	# ¿¿¿??? (from Kdisplay and mc)
+filetype code \.i$	# Appears in the kbindings for Qt
+filetype code \.pri$	# from Qt
+filetype code \.schema$	# Not really sure what this means
+filetype code \.fd$	# Something to do with latex
+filetype code \.cls$	# Something to do with latex
+filetype code \.pro$	# Postscript generation
+filetype code \.ppd$	# PDF generation
+filetype code \.dlg$	# Not really sure what this means
+filetype code \.plugin$	# Plug-in file
+filetype code \.dsp$	# Microsoft Developer Studio Project File
+filetype code \.vim$	# vim syntax file
+filetype code \.trm$	# gnuplot term file
+filetype code \.font$	# Font mapping
+filetype code \.ccg$	# C++ files - Found in gtkmm*
+filetype code \.hg$	# C++ headers - Found in gtkmm*
+filetype code \.dtd$	# XML Document Type Definition
+filetype code \.bat$	# DOS batch files
+filetype code \.vala$	# Vala
+filetype code \.py\.in$
+filetype code \.rhtml$	# eRuby
+filetype code \.sql$	# SQL script
+#
+#
+# Development documentation files (for hacking generally)
+#
+filetype devel-doc ^readme.*$
+filetype devel-doc ^changelog.*
+filetype devel-doc ^todo.*$
+filetype devel-doc ^credits.*$
+filetype devel-doc ^authors.*$
+filetype devel-doc ^changes.*$
+filetype devel-doc ^news.*$
+filetype devel-doc ^install.*$
+filetype devel-doc ^hacking.*$
+filetype devel-doc ^copyright.*$
+filetype devel-doc ^licen(s|c)e.*$
+filetype devel-doc ^copying.*$
+filetype devel-doc manifest$
+filetype devel-doc faq$
+filetype devel-doc building$
+filetype devel-doc howto$
+filetype devel-doc design$
+filetype devel-doc \.files$
+filetype devel-doc files$
+filetype devel-doc subdirs$
+filetype devel-doc maintainers$
+filetype devel-doc developers$
+filetype devel-doc contributors$
+filetype devel-doc thanks$
+filetype devel-doc releasing$
+filetype devel-doc test$
+filetype devel-doc testing$
+filetype devel-doc build$
+filetype devel-doc comments?$
+filetype devel-doc bugs$
+filetype devel-doc buglist$
+filetype devel-doc problems$
+filetype devel-doc debug$
+filetype devel-doc hacks$
+filetype devel-doc hacking$
+filetype devel-doc versions?$
+filetype devel-doc mappings$
+filetype devel-doc tips$
+filetype devel-doc ideas?$
+filetype devel-doc spec$
+filetype devel-doc compiling$
+filetype devel-doc notes$
+filetype devel-doc missing$
+filetype devel-doc done$
+filetype devel-doc \.omf$	# XML-based format used in GNOME
+filetype devel-doc \.lsm$
+filetype devel-doc ^doxyfile$
+filetype devel-doc \.kdevprj$
+filetype devel-doc \.directory$
+filetype devel-doc \.dox$
+filetype devel-doc \.doap$
+#
+#
+# Building, compiling, configuration and CVS admin files
+#
+filetype build \.in.*$
+filetype build configure.*$
+filetype build makefile.*$
+filetype build config\.sub$
+filetype build config\.guess$
+filetype build config\.status$
+filetype build ltmain\.sh$
+filetype build autogen\.sh$
+filetype build config$
+filetype build conf$
+filetype build cvsignore$
+filetype build \.cfg$
+filetype build \.m4$
+filetype build \.mk$
+filetype build \.mak$
+filetype build \.make$
+filetype build \.mbx$
+filetype build \.protocol$
+filetype build \.version$
+filetype build mkinstalldirs$
+filetype build install-sh$
+filetype build rules$
+filetype build \.kdelnk$
+filetype build \.menu$
+filetype build linguas$	# Build translations
+filetype build potfiles.*$	# Build translations
+filetype build \.shlibs$	# Shared libraries
+# filetype build %debian%
+# filetype build %specs/%
+filetype build \.spec$	# It seems they're necessary for RPM builds
+filetype build \.def$	# build bootstrap for DLLs on win32
+#
+#
+# Documentation files
+#
+# filetype documentation doc/%
+# filetype documentation %HOWTO%
+filetype documentation \.html$
+filetype documentation \.txt$
+filetype documentation \.ps(\.gz|\.bz2)?$
+filetype documentation \.dvi(\.gz|\.bz2)?$
+filetype documentation \.lyx$
+filetype documentation \.tex$
+filetype documentation \.texi$
+filetype documentation \.pdf(\.gz|\.bz2)?$
+filetype documentation \.djvu$
+filetype documentation \.epub$
+filetype documentation \.sgml$
+filetype documentation \.docbook$
+filetype documentation \.wml$
+filetype documentation \.xhtml$
+filetype documentation \.phtml$
+filetype documentation \.shtml$
+filetype documentation \.htm$
+filetype documentation \.rdf$
+filetype documentation \.phtm$
+filetype documentation \.tmpl$
+filetype documentation \.ref$	# References
+filetype documentation \.css$
+# filetype documentation %tutorial%
+filetype documentation \.templates$
+filetype documentation \.dsl$
+filetype documentation \.ent$
+filetype documentation \.xml$
+filetype documentation \.xmi$
+filetype documentation \.xsl$
+filetype documentation \.entities$
+filetype documentation \.[1-7]$	# Man pages
+filetype documentation \.man$
+filetype documentation \.manpages$
+filetype documentation \.doc$
+filetype documentation \.rtf$
+filetype documentation \.wpd$
+filetype documentation \.qt3$
+filetype documentation man\d?/.*\.\d$
+filetype documentation \.docs$
+filetype documentation \.sdw$	# OpenOffice.org Writer document
+filetype documentation \.odt$	# OpenOffice.org document
+filetype documentation \.en$	# Files in English language
+filetype documentation \.de$	# Files in German
+filetype documentation \.es$	# Files in Spanish
+filetype documentation \.fr$	# Files in French
+filetype documentation \.it$	# Files in Italian
+filetype documentation \.cz$	# Files in Czech
+filetype documentation \.page$	# Mallard
+filetype documentation \.page\.stub$	# Mallard stub
+#
+#
+# Images
+#
+filetype image \.png$
+filetype image \.jpg$
+filetype image \.jpeg$
+filetype image \.bmp$
+filetype image \.gif$
+filetype image \.xbm$
+filetype image \.eps$
+filetype image \.mng$
+filetype image \.pnm$
+filetype image \.pbm$
+filetype image \.ppm$
+filetype image \.pgm$
+filetype image \.gbr$
+filetype image \.svg$
+filetype image \.fig$
+filetype image \.tif$
+filetype image \.swf$
+filetype image \.svgz$
+filetype image \.shape$	# XML files used for shapes for instance in Kivio
+filetype image \.sml$	# XML files used for shapes for instance in Kivio
+filetype image \.bdf$	# vfontcap - Vector Font Capability Database (VFlib Version 2)
+filetype image \.ico$
+filetype image \.dia$	# We consider .dia as images; I don't want them in unknown
+#
+#
+# Translation files
+#
+filetype translation \.po$
+filetype translation \.pot$
+filetype translation \.charset$
+filetype translation \.mo$
+#
+#
+# User interface files
+#
+filetype ui \.desktop$
+filetype ui \.ui$
+filetype ui \.xpm$
+filetype ui \.xcf$
+filetype ui \.3ds$
+filetype ui \.theme$
+filetype ui \.kimap$
+filetype ui \.glade$
+filetype ui \.gtkbuilder$
+filetype ui rc$
+#
+#
+# Sound files
+#
+filetype multimedia \.mp3$
+filetype multimedia \.ogg$
+filetype multimedia \.wav$
+filetype multimedia \.au$
+filetype multimedia \.mid$
+filetype multimedia \.vorbis$
+filetype multimedia \.midi$
+filetype multimedia \.arts$
+#
+#
+# Packages (yes, there are people who upload packages to the repo)
+#
+filetype package \.tar$
+filetype package \.tar\.gz$
+filetype package \.tar\.bz2$
+filetype package \.tar\.xz$
+filetype package \.tgz$
+filetype package \.deb$
+filetype package \.rpm$
+filetype package \.srpm$
+filetype package \.ebuild$