Optionally reduce memory footprint

This patch adds an option to reduce the program's memory footprint at
runtime and updates the README file accordingly.

Instead of loading all file contents into memory and then sorting the
entries, it goes through the logs entry by entry, always emitting the
oldest one.

Reduced-memory operation increases the time needed to merge the logs by
approximately 25%.

The option is `-m` or `--min-memory`.
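
In miniature, the difference between the two modes looks like this (a toy
sketch with plain tuples standing in for log entries, not the patch's code;
`heapq.merge` performs the same oldest-first streaming merge, assuming each
input is already sorted by timestamp):

    import heapq

    # Two logs, each already ordered by timestamp.
    log_a = [(1, 'a1'), (4, 'a2'), (9, 'a3')]
    log_b = [(2, 'b1'), (3, 'b2'), (8, 'b3')]

    # Default (memory-hogging) mode: materialize everything, then sort.
    merged_hog = sorted(log_a + log_b)

    # Min-memory mode: stream the entries, always emitting the oldest head.
    merged_min = list(heapq.merge(log_a, log_b))

    assert merged_hog == merged_min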
Gorka Eguileor 2016-03-20 21:03:08 +01:00
parent c0e26d3789
commit 8459bad993
2 changed files with 98 additions and 21 deletions


@@ -18,9 +18,18 @@ Limitations
 This tool is not able to properly (or meaningfully) merge logs if your servers
 are not time synced to a common time source.

-This is a naive implementation, not smart at all: instead of comparing input
-dates at runtime as they come from the log files, we create a big in-memory
-list with all log lines, sort them, and spit them out. This can be improved.
+By default os-log-merger uses a memory-hogging implementation, because it
+considerably reduces the time needed to complete the merging. This
+implementation loads all file contents into memory, sorts them, and then
+proceeds to output the merged result.
+
+For operation on memory-constrained systems and with log files of considerable
+size, os-log-merger can run in a memory-conservative mode where log entries
+are read from the files one by one and sorted as they come.
+
+This memory reduction has an impact on processing speed, and will increase the
+time needed to process the files by approximately 25%.

 How to install
 ~~~~~~~~~~~~~~
@@ -52,3 +61,9 @@ The previous example would produce something like this::

 References to http url files instead of local files are also supported. Files
 will be cached locally to avoid re-downloading on next runs.
+
+Limit memory usage
+~~~~~~~~~~~~~~~~~~
+
+We can disable the default speed-optimized operation in those cases where we
+want to favor a small memory footprint by using option `-m` (`--min-memory`).
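
For instance, the memory-conservative mode would be enabled like this (a
hypothetical invocation; the executable name and file paths are assumptions
here, so adjust them to the entry point your installation provides)::

    os-log-merger -m node1/n-api.log node2/n-api.log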


@@ -127,19 +127,25 @@ class OpenStackLog:
         except IndexError:
             return None

-    def log_entries(self):
-        entry = None
+    def __iter__(self):
+        self.entry = None
+        self.next_entry = None
+        return self
+
+    def _readline(self, entry):
         while True:
             line = self._file.readline()
             if line == "":
-                break
+                return entry, None
             try:
                 new_entry = self._extract_with_date(line)
                 if new_entry is None:
                     continue
                 if entry:
-                    yield entry
+                    return entry, new_entry
                 entry = new_entry
             except ValueError:
                 # it's a non-dated line, just append to the entry
                 # extra info
@@ -148,33 +154,86 @@ class OpenStackLog:
                 entry = (date_object, filename, pid, level,
                          rest + EXTRALINES_PADDING + line)
-        if entry:
-            yield entry
+
+    def __next__(self):
+        return self.next()
+
+    def next(self):
+        self.entry, self.next_entry = self._readline(self.next_entry)
+        if self.entry is None:
+            raise StopIteration()
+        return self.entry
+
+    def peek(self):
+        return self.entry
+
+    def __cmp__(self, other):
+        # Entries are (date, ...) tuples, so compare by date; a log that
+        # has reached EOF (peek() is None) sorts after one that has not.
+        if other.peek() is None or self.peek() is None:
+            if self.peek() is None:
+                return 0 if other.peek() is None else 1
+            return -1
+        return cmp(self.peek()[0], other.peek()[0])
+
+
+def process_logs_limit_memory_usage(logs):
+    oslogs = [iter(log) for log in logs]
+    # Advance each log to its first entry so peek() has something to show.
+    for log in oslogs:
+        next(log)
+
+    while True:
+        entry = min(oslogs)
+        result = entry.peek()
+        if result is None:
+            break
+        yield result
+        try:
+            next(entry)
+        except StopIteration:
+            # We don't need to remove the log, since the code copes with
+            # files that have reached the end, but there is no point in
+            # continuing to check a file that has already reached EOF.
+            oslogs.remove(entry)
+            if not oslogs:
+                break
+
+
+def process_logs_memory_hog(logs):
+    all_entries = []
+    # read all the logs
+    for log in logs:
+        for entry in log:
+            all_entries.append(entry)
+
+    sorted_entries = sorted(all_entries, key=lambda log_entry: log_entry[0])
+    for entry in sorted_entries:
+        yield entry
+
+
 def process_logs(cfg):
-    all_entries = []
     filename_alias = {}
+    logs = []
     for filename in cfg.logfiles:
         path, alias, is_url = get_path_and_alias(filename,
                                                  cfg.log_base,
                                                  cfg.log_postfix)
         filename_alias[path] = (filename, alias, is_url)
-        # read the log
-        oslog = OpenStackLog(path)
-        for entry in oslog.log_entries():
-            all_entries.append(entry)
+        logs.append(OpenStackLog(path))

     alias = generate_aliases(filename_alias, cfg)

-    sorted_entries = sorted(all_entries, key=lambda log_entry: log_entry[0])
-    for entry in sorted_entries:
+    if cfg.limit_memory:
+        method = process_logs_limit_memory_usage
+    else:
+        method = process_logs_memory_hog
+
+    for entry in method(logs):
         (date_object, filename, pid, level, rest) = entry
-        print (' '.join(
-            [date_object.strftime("%Y-%m-%d %H:%M:%S.%f"),
-             '[%s]' % alias[filename], pid,
-             level, rest]).rstrip('\n'))
+        print (' '.join([date_object.strftime("%Y-%m-%d %H:%M:%S.%f"),
+                         '[%s]' % alias[filename],
+                         pid, level, rest]).rstrip('\n'))
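
Two portability notes on the merge machinery above, with a small sketch (an
illustration under stated assumptions, not part of this patch): `__cmp__` and
the `cmp()` builtin exist only in Python 2, so under Python 3 the ordering
used by `min()` would need a rich comparison such as `__lt__`; and because
every input file is assumed to be internally time-ordered, the hand-rolled
`min()` loop could also be expressed with the standard library's
`heapq.merge` (the `merge_logs` name below is a hypothetical of ours):

    import heapq

    # Python 3 replacement for OpenStackLog.__cmp__: a log that has hit
    # EOF (peek() is None) sorts after one that still has entries.
    def __lt__(self, other):
        if self.peek() is None:
            return False
        if other.peek() is None:
            return True
        return self.peek()[0] < other.peek()[0]

    # Equivalent oldest-first streaming merge; key= needs Python 3.5+.
    def merge_logs(logs):
        return heapq.merge(*logs, key=lambda entry: entry[0])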
def get_path_and_alias(filename, log_base, log_postfix):
@@ -397,6 +456,9 @@ one has not been provided:'
     parser.add_argument('--alias-level', '-a', type=int, default=0,
                         dest='alias_level',
                         help='Level of smart alias naming (0-3)')
+    parser.add_argument('--min-memory', '-m', default=False,
+                        action='store_true', dest='limit_memory',
+                        help='Limit memory usage')
     return parser.parse_args()
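
Since `store_true` options default to False and flip to True only when the
flag is present, the new option's behavior can be checked in isolation (a
self-contained toy parser mirroring the added lines, not the project's own
parser object):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--min-memory', '-m', default=False,
                        action='store_true', dest='limit_memory',
                        help='Limit memory usage')

    assert parser.parse_args([]).limit_memory is False
    assert parser.parse_args(['-m']).limit_memory is True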