From 27599915cb4971532bcad846184756c8481e9080 Mon Sep 17 00:00:00 2001 From: Matthew Booth Date: Wed, 20 Sep 2017 16:43:36 +0100 Subject: [PATCH] Refactor log parsing into separate parser classes All log entries are now the lighter weight LogEntry class, which is also responsible for its own output. Parsing is simplified and separated into independent parser classes. Change-Id: I264cf20933e8af007556efd7a36639f854460f49 --- oslogmerger/oslogmerger.py | 295 ++++++++++++++++--------------------- 1 file changed, 129 insertions(+), 166 deletions(-) diff --git a/oslogmerger/oslogmerger.py b/oslogmerger/oslogmerger.py index 26b74a6..d41de47 100644 --- a/oslogmerger/oslogmerger.py +++ b/oslogmerger/oslogmerger.py @@ -78,85 +78,119 @@ FILE_MAP = { class LogEntry(object): - separator = ' ' - date_format = None - _date_parse_msg = 'unconverted data remains: ' + def __init__(self, alias, dt, data, dt_str=None): + self.alias = alias + self.dt = dt + self.data = data - def __init__(self, **kwargs): - self._date_length = None - self.__dict__.update(**kwargs) - - @classmethod - def get_init_args(cls, filename): - return {} - - def prepare_line(self, line): - return line.replace('\0', ' ') - - def parse_date(self, line): - try: - dt = datetime.strptime(line, self.date_format) - except ValueError as e: - if not e.args[0].startswith(self._date_parse_msg): - raise - prepared_date_length = (len(line) - len(e.args[0]) + - len(self._date_parse_msg)) - dt = datetime.strptime(line[:prepared_date_length], - self.date_format) - self._date_length = prepared_date_length - return dt - - def _calculate_date_length(self): - return len(self.date.strftime(self.date_format)) - - @property - def date_length(self): - if not self._date_length: - self._date_length = self._calculate_date_length() - return self._date_length - - @classmethod - def factory(cls, filename, line, **kwargs): - self = cls(**kwargs) - - self.filename = filename - if not line: - raise ValueError - - # Prepare the line for date 
parsing - prepared_line = self.prepare_line(line) - - # Extract the datetime - self.date = self.parse_date(prepared_line) - - if (len(line) == self.date_length or - line[self.date_length] != self.separator): - raise ValueError - - self.date_str = line[:self.date_length] - # +1 to remove the separator so we don't have 2 spaces on output - self.data = line[self.date_length + 1:] - return self + if dt_str is not None: + self.dt_str = dt_str + else: + self.dt_str = self.dt.strftime('%Y-%m-%d %H:%M:%S.%f') def append_line(self, line): self.data += EXTRALINES_PADDING + line def __cmp__(self, other): - return cmp(self.date, other.date) + return cmp(self.dt, other.dt) + + def __str__(self): + return '%s [%s] %s' % (self.dt_str, self.alias, self.data.rstrip('\n')) + + +class LogParser(object): + def parse_line(self, line): + raise NotImplementedError + + +class StrptimeParser(LogParser): + date_format = None + + def __init__(self, filename): + self.date_format_words = len(self.date_format.split(' ')) + + def parse_line(self, line): + # Split the input line into words, up to `date_format_words`. Data is + # anything after that. Join the first `date_format_words` words to + # recreate the date. 
+ dt_str = line.split(' ', self.date_format_words) + data = dt_str.pop() + dt_str = ' '.join(dt_str) + + dt = datetime.strptime(dt_str, self.date_format) + + # +1 to remove the separator so we don't have 2 spaces on output + return dt, dt_str, data + + +class OSLogParser(StrptimeParser): + """OpenStack default log: 2016-02-01 10:22:59.239""" + date_format = '%Y-%m-%d %H:%M:%S.%f' + + +class MsgLogParser(StrptimeParser): + """Message format: Oct 15 14:11:19""" + date_format = '%b %d %H:%M:%S' + + def __init__(self, filename): + super(MsgLogParser, self).__init__(filename) + stat = os.stat(filename) + + # TODO: handle the case where log file was closed after a year boundary + log_modified = datetime.fromtimestamp(stat.st_mtime) + self.year = log_modified.year + + def parse_line(self, line): + dt, dt_str, data = super(MsgLogParser, self).parse_line(line) + return dt.replace(self.year), dt_str, data + + +class TSLogParser(LogParser): + """Timestamped log: [275514.814982]""" + + def __init__(self, filename): + stat = os.stat(filename) + mtime = datetime.fromtimestamp(stat.st_mtime) + timestamp = self._get_last_timestamp(filename) + self.start_date = mtime - timedelta(seconds=timestamp) + + @classmethod + def _get_last_timestamp(cls, filename): + result = None + with open(filename, 'r') as f: + file_size = os.fstat(f.fileno()).st_size + # We will jump to the last KB so we don't have to read all file + offset = max(0, file_size - 1024) + f.seek(offset) + for line in f: + try: + __, result = cls._read_timestamp(line) + except ValueError: + continue + + return result + + @staticmethod + def _read_timestamp(line): + start = line.index('[') + 1 + end = line.index(']') + + if end < start: + raise ValueError + + return end, float(line[start:end]) + + def parse_line(self, line): + end, timestamp = self._read_timestamp(line) + dt = self.start_date + timedelta(seconds=timestamp) + return dt, line[:end + 1], line[end + 1:] class LogFile(object): - log_entry_class = LogEntry - - 
@staticmethod - def factory(cls, filename): - instance = LogFile(filename) - instance.log_entry_class = cls - instance.entry_kwargs = cls.get_init_args(filename) - return instance - - def __init__(self, filename): + def __init__(self, filename, alias, parser_cls): self.open(filename) + self.alias = alias + self.parser = parser_cls(filename) def open(self, filename): self._filename = filename @@ -164,8 +198,6 @@ class LogFile(object): filename = self._cached_download(filename) self._file = open(filename, 'r') - stat = os.stat(filename) - self.mtime = datetime.fromtimestamp(stat.st_mtime) def _url_cache_path(self, url): md5 = hashlib.md5() @@ -207,18 +239,16 @@ class LogFile(object): line = self._file.readline() if line == "": return entry, None + line.replace('\0', ' ') try: - new_entry = self.log_entry_class.factory(self._filename, - line, - **self.entry_kwargs) - if new_entry is None: - continue + dt, dt_str, data = self.parser.parse_line(line) + new_entry = LogEntry(self.alias, dt, data, dt_str=dt_str) if entry: return entry, new_entry entry = new_entry - except Exception: + except ValueError: # it's probably a non-dated line, or a garbled entry, just # append to the entry extra info if entry: @@ -247,104 +277,37 @@ class LogFile(object): return cmp(self.peek(), other.peek()) -class MsgLogEntry(LogEntry): - """Message format: Oct 15 14:11:19""" - date_format = '%Y%b %d %H:%M:%S' - - @classmethod - def get_init_args(cls, filename): - kwargs = super(MsgLogEntry, cls).get_init_args(filename) - stat = os.stat(filename) - kwargs['file_year'] = datetime.fromtimestamp(stat.st_mtime).year - return kwargs - - def prepare_line(self, line): - # TODO: If year of file creation and file last modification are - # different we should start with the cration year and then change to - # the next year once the months go back. 
- line = super(MsgLogEntry, self).prepare_line(line) - return '%s%s' % (self.file_year, line) - - def _calculate_date_length(self): - return super(MsgLogEntry, self)._calculate_date_length() - 4 - - -class OSLogEntry(LogEntry): - """OpenStack default log: 2016-02-01 10:22:59.239""" - date_format = '%Y-%m-%d %H:%M:%S.%f' - - def _calculate_date_length(self): - return super(OSLogEntry, self)._calculate_date_length() - 3 - - -class TSLogEntry(LogEntry): - """Timestamped log: [275514.814982]""" - - @classmethod - def get_init_args(cls, filename): - kwargs = super(TSLogEntry, cls).get_init_args(filename) - stat = os.stat(filename) - mtime = datetime.fromtimestamp(stat.st_mtime) - timestamp = cls._get_last_timestamp(filename) - kwargs['start_date'] = mtime - timedelta(seconds=timestamp) - return kwargs - - @classmethod - def _get_last_timestamp(cls, filename): - result = None - with open(filename, 'r') as f: - file_size = os.fstat(f.fileno()).st_size - # We will jump to the last KB so we don't have to read all file - offset = max(0, file_size - 1024) - f.seek(offset) - for line in f: - try: - __, result = cls._read_timestamp(line) - except ValueError: - continue - - return result - - @staticmethod - def _read_timestamp(line): - start = line.index('[') + 1 - end = line.index(']') - - if end < start: - raise ValueError - - return end, float(line[start:end]) - - def parse_date(self, date_str): - end, timestamp = self._read_timestamp(date_str) - self._date_length = end + 1 - return self.start_date + timedelta(seconds=timestamp) - - -LOG_TYPES = [ - ('logfiles', OSLogEntry), - ('logfiles_m', MsgLogEntry), - ('logfiles_t', TSLogEntry), -] +LOG_TYPES = { + 'logfiles': OSLogParser, + 'logfiles_m': MsgLogParser, + 'logfiles_t': TSLogParser, +} def process_logs(cfg): filename_alias = {} logs = [] - for arg_name, entry_cls in LOG_TYPES: - for filename in getattr(cfg, arg_name): - path, alias, is_url = get_path_and_alias(filename, - cfg.log_base, - cfg.log_postfix) - 
filename_alias[path] = (filename, alias, is_url) - logs.append(LogFile.factory(entry_cls, path)) - alias = generate_aliases(filename_alias, cfg) + paths_aliases = {} + paths_parsers = {} + for arg_name, parser_cls in LOG_TYPES.items(): + for filename in getattr(cfg, arg_name): + path, alias, is_url = get_path_and_alias(filename, cfg.log_base, + cfg.log_postfix) + paths_aliases[path] = (filename, alias, is_url) + paths_parsers[path] = parser_cls + + # NOTE(mdbooth): I feel like generate_aliases should take a single path, + # which would make this loop much tidier. I don't want to unpick it right + # now, though. + aliases = generate_aliases(paths_aliases, cfg) + + logs = [LogFile(path, aliases[path], parser_cls) + for path, parser_cls in paths_parsers.items()] entry_iters = [iter(log) for log in logs] for entry in heapq.merge(*entry_iters): - print('%s [%s] %s' % (entry.date_str, alias[entry.filename], - entry.data.rstrip('\n'))) + print(entry) def get_path_and_alias(filename, log_base, log_postfix):