lodgeit/lodgeit/lib/antispam.py

49 lines
1.1 KiB
Python

# -*- coding: utf-8 -*-
"""
lodgeit.lib.antispam
~~~~~~~~~~~~~~~~~~~~
Fight stupid spammers.
:copyright: 2007-2008 by Armin Ronacher, Christopher Grebs.
:license: BSD
"""
from __future__ import division
import re
from operator import sub
from itertools import starmap
# Regex fragment matching the beginning of a link: either a scheme
# followed by ``://`` (http, https, ftp, ...) or a scheme followed by a
# bare ``:`` (mailto, telnet, skype, ...).  It is interpolated into
# `_link_re` below via the ``%(url)s`` placeholder.
_url_pattern = (
r'(?:(?:https?|ftps?|file|ssh|mms|irc|rsync|smb)://|'
r'(?:mailto|telnet|s?news|sips?|skype):)'
)
# Matches one link; ``(?xi)`` enables verbose mode and case-insensitivity.
#   group 1: a bare URL -- the prefix above followed by everything up to
#            the next whitespace or quote character.
#   group 2: an HTML anchor ``<a ...>...</a>`` with one or more
#            attributes, each of whose values (double-quoted,
#            single-quoted or unquoted) starts with the URL prefix.
_link_re = re.compile(r'''(?xi)
(%(url)s[^\s\'"]+)|
(<a(?:\s+\w+\s*=\s*
(?:"\s*%(url)s.*?"|'\s*%(url)s.*?'|%(url)s.*?)
)+\s*>.*?</a>)
''' % dict(url=_url_pattern))
# maximum share of the text, in percent, that may consist of links
MAX_LINK_PERCENTAGE = 30
# maximum number of links in the text (hard limit)
MAX_LINK_NUMBER = 15
def check_for_link_spam(code):
    """Return ``True`` if `code` looks like link spam.

    The text is flagged when either limit is exceeded:

    * it contains more than `MAX_LINK_NUMBER` links (hard limit), or
    * links make up more than `MAX_LINK_PERCENTAGE` percent of its
      characters.

    :param code: the text to inspect.
    :return: ``True`` if the text is considered spam, ``False`` otherwise.
    """
    spans = [match.span() for match in _link_re.finditer(code)]
    # Hard limit on the number of links, independent of the text length.
    # The count must be compared against MAX_LINK_NUMBER, not against the
    # percentage threshold.
    if len(spans) > MAX_LINK_NUMBER:
        return True
    # Total number of characters covered by link matches.
    link_chars = sum(end - start for start, end in spans)
    # ``len(code) or 1`` avoids a ZeroDivisionError for empty input; the
    # module enables true division, so the ratio is not truncated.
    return link_chars * 100 / (len(code) or 1) > MAX_LINK_PERCENTAGE
def is_spam(code):
    """Decide whether the submitted `code` is spam.

    Delegates to :func:`check_for_link_spam`, the only check applied
    here.

    :param code: the text to classify.
    :return: the result of :func:`check_for_link_spam` for `code`.
    """
    link_spam = check_for_link_spam(code)
    return link_spam