Refactor log parsing into separate parser classes

All log entries are now instances of the lighter-weight LogEntry class,
which is also responsible for its own output. Parsing is simplified and
separated into independent parser classes.

Change-Id: I264cf20933e8af007556efd7a36639f854460f49
Matthew Booth 2017-09-20 16:43:36 +01:00
parent f2be96fe6a
commit 27599915cb
1 changed file with 129 additions and 166 deletions
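
The heart of the refactor is a new contract: a parser's parse_line() turns one
raw line into a (dt, dt_str, data) tuple, and LogEntry merely stores and prints
that result. A minimal standalone sketch of the contract, assuming the
OpenStack date format used by OSLogParser in the diff below (the Sketch name
is illustrative, not from the commit):

    from datetime import datetime

    class SketchParser(object):
        # Same idea as StrptimeParser below: the date is the first N
        # space-separated words, everything after them is the payload.
        date_format = '%Y-%m-%d %H:%M:%S.%f'   # two words

        def parse_line(self, line):
            words = line.split(' ', 2)          # maxsplit = word count
            data = words.pop()                  # payload after the date
            dt_str = ' '.join(words)            # reassembled date string
            return datetime.strptime(dt_str, self.date_format), dt_str, data

    dt, dt_str, data = SketchParser().parse_line(
        '2016-02-01 10:22:59.239 INFO nova.compute starting\n')
    print('%s | %s' % (dt_str, data.rstrip('\n')))
    # 2016-02-01 10:22:59.239 | INFO nova.compute starting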


@@ -78,85 +78,119 @@ FILE_MAP = {

 class LogEntry(object):
-    separator = ' '
-    date_format = None
-    _date_parse_msg = 'unconverted data remains: '
+    def __init__(self, alias, dt, data, dt_str=None):
+        self.alias = alias
+        self.dt = dt
+        self.data = data

-    def __init__(self, **kwargs):
-        self._date_length = None
-        self.__dict__.update(**kwargs)
-
-    @classmethod
-    def get_init_args(cls, filename):
-        return {}
-
-    def prepare_line(self, line):
-        return line.replace('\0', ' ')
-
-    def parse_date(self, line):
-        try:
-            dt = datetime.strptime(line, self.date_format)
-        except ValueError as e:
-            if not e.args[0].startswith(self._date_parse_msg):
-                raise
-            prepared_date_length = (len(line) - len(e.args[0]) +
-                                    len(self._date_parse_msg))
-            dt = datetime.strptime(line[:prepared_date_length],
-                                   self.date_format)
-            self._date_length = prepared_date_length
-        return dt
-
-    def _calculate_date_length(self):
-        return len(self.date.strftime(self.date_format))
-
-    @property
-    def date_length(self):
-        if not self._date_length:
-            self._date_length = self._calculate_date_length()
-        return self._date_length
-
-    @classmethod
-    def factory(cls, filename, line, **kwargs):
-        self = cls(**kwargs)
-
-        self.filename = filename
-        if not line:
-            raise ValueError
-
-        # Prepare the line for date parsing
-        prepared_line = self.prepare_line(line)
-
-        # Extract the datetime
-        self.date = self.parse_date(prepared_line)
-
-        if (len(line) == self.date_length or
-                line[self.date_length] != self.separator):
-            raise ValueError
-
-        self.date_str = line[:self.date_length]
-        # +1 to remove the separator so we don't have 2 spaces on output
-        self.data = line[self.date_length + 1:]
-
-        return self
+        if dt_str is not None:
+            self.dt_str = dt_str
+        else:
+            self.dt_str = self.dt.strftime('%Y-%m-%d %H:%M:%S.%f')

     def append_line(self, line):
         self.data += EXTRALINES_PADDING + line

     def __cmp__(self, other):
-        return cmp(self.date, other.date)
+        return cmp(self.dt, other.dt)
+
+    def __str__(self):
+        return '%s [%s] %s' % (self.dt_str, self.alias, self.data.rstrip('\n'))
+
+
+class LogParser(object):
+    def parse_line(self, line):
+        raise NotImplementedError
+
+
+class StrptimeParser(LogParser):
+    date_format = None
+
+    def __init__(self, filename):
+        self.date_format_words = len(self.date_format.split(' '))
+
+    def parse_line(self, line):
+        # Split the input line into words, up to <date_format_words>. Data is
+        # anything after that. Join the first <date_format_words> words to
+        # recreate the date.
+        dt_str = line.split(' ', self.date_format_words)
+        data = dt_str.pop()
+        dt_str = ' '.join(dt_str)
+
+        dt = datetime.strptime(dt_str, self.date_format)
+
+        # +1 to remove the separator so we don't have 2 spaces on output
+        return dt, dt_str, data
+
+
+class OSLogParser(StrptimeParser):
+    """OpenStack default log: 2016-02-01 10:22:59.239"""
+    date_format = '%Y-%m-%d %H:%M:%S.%f'
+
+
+class MsgLogParser(StrptimeParser):
+    """Message format: Oct 15 14:11:19"""
+    date_format = '%b %d %H:%M:%S'
+
+    def __init__(self, filename):
+        super(MsgLogParser, self).__init__(filename)
+        stat = os.stat(filename)
+
+        # TODO: handle the case where log file was closed after a year boundary
+        log_modified = datetime.fromtimestamp(stat.st_mtime)
+        self.year = log_modified.year
+
+    def parse_line(self, line):
+        dt, dt_str, data = super(MsgLogParser, self).parse_line(line)
+        return dt.replace(self.year), dt_str, data
+
+
+class TSLogParser(LogParser):
+    """Timestamped log: [275514.814982]"""
+
+    def __init__(self, filename):
+        stat = os.stat(filename)
+        mtime = datetime.fromtimestamp(stat.st_mtime)
+        timestamp = self._get_last_timestamp(filename)
+        self.start_date = mtime - timedelta(seconds=timestamp)
+
+    @classmethod
+    def _get_last_timestamp(cls, filename):
+        result = None
+        with open(filename, 'r') as f:
+            file_size = os.fstat(f.fileno()).st_size
+            # We will jump to the last KB so we don't have to read all file
+            offset = max(0, file_size - 1024)
+            f.seek(offset)
+            for line in f:
+                try:
+                    __, result = cls._read_timestamp(line)
+                except ValueError:
+                    continue
+
+        return result
+
+    @staticmethod
+    def _read_timestamp(line):
+        start = line.index('[') + 1
+        end = line.index(']')
+
+        if end < start:
+            raise ValueError
+
+        return end, float(line[start:end])
+
+    def parse_line(self, line):
+        end, timestamp = self._read_timestamp(line)
+        dt = self.start_date + timedelta(seconds=timestamp)
+        return dt, line[:end + 1], line[end + 1:]


 class LogFile(object):
-    log_entry_class = LogEntry
-
-    @staticmethod
-    def factory(cls, filename):
-        instance = LogFile(filename)
-        instance.log_entry_class = cls
-        instance.entry_kwargs = cls.get_init_args(filename)
-        return instance
-
-    def __init__(self, filename):
+    def __init__(self, filename, alias, parser_cls):
         self.open(filename)
+        self.alias = alias
+        self.parser = parser_cls(filename)

     def open(self, filename):
         self._filename = filename
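
To make StrptimeParser's word arithmetic concrete: MsgLogParser's format
'%b %d %H:%M:%S' is three space-separated words, so parse_line splits with
maxsplit=3 and pops the fourth element as the data. A worked example with a
made-up syslog line:

    from datetime import datetime

    line = 'Oct 15 14:11:19 node-1 kernel: eth0 link up\n'
    date_format_words = len('%b %d %H:%M:%S'.split(' '))    # 3

    parts = line.split(' ', date_format_words)
    # ['Oct', '15', '14:11:19', 'node-1 kernel: eth0 link up\n']
    data = parts.pop()                  # 'node-1 kernel: eth0 link up\n'
    dt_str = ' '.join(parts)            # 'Oct 15 14:11:19'

    dt = datetime.strptime(dt_str, '%b %d %H:%M:%S')   # year defaults to 1900
    dt = dt.replace(year=2017)          # MsgLogParser swaps in the file's
                                        # mtime year via dt.replace(self.year)

One caveat worth noting: syslog pads single-digit days with an extra space
('Oct  5 ...'), which makes the reassembled date fail strptime, so such lines
would fall through to the continuation-line path below.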
@@ -164,8 +198,6 @@ class LogFile(object):
             filename = self._cached_download(filename)

         self._file = open(filename, 'r')
-        stat = os.stat(filename)
-        self.mtime = datetime.fromtimestamp(stat.st_mtime)

     def _url_cache_path(self, url):
         md5 = hashlib.md5()
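
TSLogParser has no absolute dates to parse: [275514.814982]-style entries
count seconds since boot. It anchors the series by assuming the file's mtime
is the wall-clock time of the last timestamp in the file (found by scanning
the final 1 KB). A sketch of that arithmetic with made-up numbers:

    from datetime import datetime, timedelta

    mtime = datetime(2017, 9, 20, 16, 43, 36)   # hypothetical file mtime
    last_timestamp = 275514.814982              # last '[...]' value in file

    # Wall-clock moment corresponding to timestamp 0.0 (boot):
    start_date = mtime - timedelta(seconds=last_timestamp)

    # Every parsed line is then an offset from that anchor:
    dt = start_date + timedelta(seconds=100000.5)
    assert dt < mtime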
@@ -207,18 +239,16 @@ class LogFile(object):
             line = self._file.readline()
             if line == "":
                 return entry, None
-            line.replace('\0', ' ')

             try:
-                new_entry = self.log_entry_class.factory(self._filename,
-                                                         line,
-                                                         **self.entry_kwargs)
-                if new_entry is None:
-                    continue
+                dt, dt_str, data = self.parser.parse_line(line)
+                new_entry = LogEntry(self.alias, dt, data, dt_str=dt_str)
                 if entry:
                     return entry, new_entry

                 entry = new_entry
-            except Exception:
+            except ValueError:
                 # it's probably a non-dated line, or a garbled entry, just
                 # append to the entry extra info
                 if entry:
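
Narrowing except Exception to except ValueError is a behavior fix as much as a
cleanup: a parser signals "no date here" with ValueError, which routes
continuation lines (tracebacks, wrapped messages) into append_line on the
previous entry, while genuine bugs now propagate instead of being swallowed.
A simplified generator version of that pairing logic, assuming any parser with
the parse_line contract above (the real LogFile keeps one entry of lookahead
via readline rather than using a generator):

    def entries(lines, parser, alias):
        entry = None
        for line in lines:
            try:
                dt, dt_str, data = parser.parse_line(line)
            except ValueError:
                # Undated or garbled line: attach to the current entry.
                if entry is not None:
                    entry.append_line(line)
                continue
            if entry is not None:
                yield entry                 # previous entry is complete
            entry = LogEntry(alias, dt, data, dt_str=dt_str)
        if entry is not None:
            yield entry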
@@ -247,104 +277,37 @@ class LogFile(object):
         return cmp(self.peek(), other.peek())


-class MsgLogEntry(LogEntry):
-    """Message format: Oct 15 14:11:19"""
-    date_format = '%Y%b %d %H:%M:%S'
-
-    @classmethod
-    def get_init_args(cls, filename):
-        kwargs = super(MsgLogEntry, cls).get_init_args(filename)
-        stat = os.stat(filename)
-        kwargs['file_year'] = datetime.fromtimestamp(stat.st_mtime).year
-        return kwargs
-
-    def prepare_line(self, line):
-        # TODO: If year of file creation and file last modification are
-        # different we should start with the creation year and then change to
-        # the next year once the months go back.
-        line = super(MsgLogEntry, self).prepare_line(line)
-        return '%s%s' % (self.file_year, line)
-
-    def _calculate_date_length(self):
-        return super(MsgLogEntry, self)._calculate_date_length() - 4
-
-
-class OSLogEntry(LogEntry):
-    """OpenStack default log: 2016-02-01 10:22:59.239"""
-    date_format = '%Y-%m-%d %H:%M:%S.%f'
-
-    def _calculate_date_length(self):
-        return super(OSLogEntry, self)._calculate_date_length() - 3
-
-
-class TSLogEntry(LogEntry):
-    """Timestamped log: [275514.814982]"""
-
-    @classmethod
-    def get_init_args(cls, filename):
-        kwargs = super(TSLogEntry, cls).get_init_args(filename)
-        stat = os.stat(filename)
-        mtime = datetime.fromtimestamp(stat.st_mtime)
-        timestamp = cls._get_last_timestamp(filename)
-        kwargs['start_date'] = mtime - timedelta(seconds=timestamp)
-        return kwargs
-
-    @classmethod
-    def _get_last_timestamp(cls, filename):
-        result = None
-        with open(filename, 'r') as f:
-            file_size = os.fstat(f.fileno()).st_size
-            # We will jump to the last KB so we don't have to read all file
-            offset = max(0, file_size - 1024)
-            f.seek(offset)
-            for line in f:
-                try:
-                    __, result = cls._read_timestamp(line)
-                except ValueError:
-                    continue
-
-        return result
-
-    @staticmethod
-    def _read_timestamp(line):
-        start = line.index('[') + 1
-        end = line.index(']')
-
-        if end < start:
-            raise ValueError
-
-        return end, float(line[start:end])
-
-    def parse_date(self, date_str):
-        end, timestamp = self._read_timestamp(date_str)
-        self._date_length = end + 1
-        return self.start_date + timedelta(seconds=timestamp)
-
-
-LOG_TYPES = [
-    ('logfiles', OSLogEntry),
-    ('logfiles_m', MsgLogEntry),
-    ('logfiles_t', TSLogEntry),
-]
+LOG_TYPES = {
+    'logfiles': OSLogParser,
+    'logfiles_m': MsgLogParser,
+    'logfiles_t': TSLogParser,
+}


 def process_logs(cfg):
-    filename_alias = {}
-    logs = []
-    for arg_name, entry_cls in LOG_TYPES:
-        for filename in getattr(cfg, arg_name):
-            path, alias, is_url = get_path_and_alias(filename,
-                                                     cfg.log_base,
-                                                     cfg.log_postfix)
-            filename_alias[path] = (filename, alias, is_url)
-            logs.append(LogFile.factory(entry_cls, path))
-
-    alias = generate_aliases(filename_alias, cfg)
+    paths_aliases = {}
+    paths_parsers = {}
+    for arg_name, parser_cls in LOG_TYPES.items():
+        for filename in getattr(cfg, arg_name):
+            path, alias, is_url = get_path_and_alias(filename, cfg.log_base,
+                                                     cfg.log_postfix)
+            paths_aliases[path] = (filename, alias, is_url)
+            paths_parsers[path] = parser_cls
+
+    # NOTE(mdbooth): I feel like generate_aliases should take a single path,
+    # which would make this loop much tidier. I don't want to unpick it right
+    # now, though.
+    aliases = generate_aliases(paths_aliases, cfg)
+
+    logs = [LogFile(path, aliases[path], parser_cls)
+            for path, parser_cls in paths_parsers.items()]

     entry_iters = [iter(log) for log in logs]
     for entry in heapq.merge(*entry_iters):
-        print('%s [%s] %s' % (entry.date_str, alias[entry.filename],
-                              entry.data.rstrip('\n')))
+        print(entry)


 def get_path_and_alias(filename, log_base, log_postfix):
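
The final merge is unchanged in spirit: each LogFile yields its entries in
file order (already time-sorted within a file), and heapq.merge interleaves
the sorted streams, comparing entries via LogEntry's __cmp__. This is
Python 2; on Python 3.5+ the equivalent would pass a key, e.g.
heapq.merge(*entry_iters, key=lambda e: e.dt). A tiny illustration with plain
tuples standing in for entries (hypothetical, pre-sorted inputs):

    import heapq

    nova = [(1, 'nova line 1'), (4, 'nova line 2')]
    neutron = [(2, 'neutron line 1'), (3, 'neutron line 2')]

    for ts, text in heapq.merge(nova, neutron):
        print('%s %s' % (ts, text))
    # 1 nova line 1
    # 2 neutron line 1
    # 3 neutron line 2
    # 4 nova line 2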