Add periodic task to remove postgres archived wal files

* Added a periodic task for postgresql datastore to clean up the
  archived WAL files.
* Added a check when creating incremental backups for postgresql.
* A new container image ``openstacktrove/db-backup-postgresql:1.1.2``
  is uploaded to docker hub.

Story: 2009066
Task: 42871
Change-Id: I235e2abf8c0405e143ded6fb48017d596b8b41a1
This commit is contained in:
Lingxian Kong 2021-07-22 16:38:08 +12:00
parent 69f08ab470
commit 02971d850b
17 changed files with 333 additions and 37 deletions

View File

@ -70,12 +70,13 @@ class PgBasebackup(base.BaseRunner):
if wal_re.search(wal_file) and wal_file >= last_wal]
return wal_files
def get_backup_file(self, backup_pos=0):
def get_backup_file(self, backup_pos=0, regex=None):
"""Look for the most recent .backup file that basebackup creates
:return: a string like 000000010000000000000006.00000168.backup
"""
backup_re = re.compile("[0-9A-F]{24}.*.backup")
regex = regex or r"[0-9A-F]{24}\..*\.backup"
backup_re = re.compile(regex)
wal_files = [wal_file for wal_file in os.listdir(self.wal_archive_dir)
if backup_re.search(wal_file)]
wal_files = sorted(wal_files, reverse=True)
@ -177,12 +178,20 @@ class PgBasebackupIncremental(PgBasebackup):
def __init__(self, *args, **kwargs):
self.parent_location = kwargs.pop('parent_location', '')
self.parent_checksum = kwargs.pop('parent_checksum', '')
self.parent_stop_wal = kwargs.pop('stop_wal_file', '')
super(PgBasebackupIncremental, self).__init__(*args, **kwargs)
self.incr_restore_cmd = f'tar -xzf - -C {self.wal_archive_dir}'
def pre_backup(self):
# Check if the parent stop wal file still exists. It may be removed
# by trove-guestagent.
parent_wal_name = self.get_backup_file(
backup_pos=0, regex=fr'{self.parent_stop_wal}\..+\.backup')
if not parent_wal_name:
raise Exception("Cannot find parent backup WAL file.")
with psql_util.PostgresConnection('postgres') as conn:
self.start_segment = conn.query(
f"SELECT pg_start_backup('{self.filename}', false, false)"

View File

@ -90,4 +90,4 @@ TROVE_NON_DEV_IMAGE_URL=${TROVE_NON_DEV_IMAGE_URL:-""}
TROVE_DATABASE_IMAGE_MYSQL=${TROVE_DATABASE_IMAGE_MYSQL:-"catalystcloud/mysql"}
TROVE_DATABASE_IMAGE_POSTGRES=${TROVE_DATABASE_IMAGE_POSTGRES:-"catalystcloud/postgres"}
TROVE_DATABASE_BACKUP_IMAGE_MYSQL=${TROVE_DATABASE_BACKUP_IMAGE_MYSQL:-"catalystcloud/db-backup-mysql:1.1.0"}
TROVE_DATABASE_BACKUP_IMAGE_POSTGRES=${TROVE_DATABASE_BACKUP_IMAGE_POSTGRES:-"catalystcloud/db-backup-postgresql:1.1.0"}
TROVE_DATABASE_BACKUP_IMAGE_POSTGRES=${TROVE_DATABASE_BACKUP_IMAGE_POSTGRES:-"catalystcloud/db-backup-postgresql:1.1.2"}

View File

@ -0,0 +1,24 @@
.. _database_management:
===================
Database Management
===================
PostgreSQL
----------
WAL(Write Ahead Log)
~~~~~~~~~~~~~~~~~~~~
By default, ``archive_mode`` is enabled in order to create incremental database backup, which is triggered by the users. ``archive_command`` is configured as well for continuous WAL archiving, the WAL files in pg_wal subdirectory are copied to ``/var/lib/postgresql/data/wal_archive``.
This becomes a problem if the WAL segment files in the archive folder keep increasing: especially on a busy system, several TBs of WALs can pile up in the archive destination (part of the data volume), which can make the database service unavailable.
In the PostgreSQL manager of trove-guestagent, there is a periodic task that cleans up the archive folder. When it runs, it checks the size of the archive folder; if the size is greater than half of the data volume size, then in the archive folder:
1. If there is no ``.backup`` file, it means the database has never been backed up before, all the WAL segment files except for the latest one are removed.
2. If there are ``.backup`` files, remove all the files older than the backup file. Check the size again, if the size condition is still met, all the WAL segment files except for the latest one are removed.
When creating an incremental backup, trove checks whether the parent backup file still exists in the archive folder; the backup creation fails if it is not found. The user is able to see the error message in the instance detail and has to create a full backup instead.
Another option is to archive WAL files to Swift(in the user's account), e.g. using WAL-G or other 3rd party tools, but that will incur charges for the object storage usage which is not optimal. We leave it to the users to decide when and how the backups should be created.

View File

@ -10,4 +10,5 @@
datastore
building_guest_images
secure_oslo_messaging
database_management
troubleshooting

View File

@ -333,3 +333,25 @@ object URL), the local datastore version and the backup data size are required.
| status | RESTORED |
| updated | 2021-02-22T01:44:06 |
+----------------------+---------------------------------------------------------------------------------------------------------------------------------------+
Troubleshooting
---------------
Failed to create incremental backup for PostgreSQL
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
One possible reason is that a long time has passed since the parent backup was created, and the parent backup WAL file was removed internally because of disk pressure. This can be confirmed by checking the instance detail, e.g.
.. code-block:: console
$ openstack database instance show e7231e46-ca3b-4dce-bf67-739b3af0ef85 -c fault
+-------+----------------------------------------------------------------------+
| Field | Value |
+-------+----------------------------------------------------------------------+
| fault | Failed to create backup c76de467-6587-4e27-bb8d-7c3d3b136663, error: |
| | Cannot find parent backup WAL file. |
+-------+----------------------------------------------------------------------+
In this case, you have to create a full backup instead.
To avoid this issue in the future, you can set up a cron job to create (incremental) backups regularly.

View File

@ -0,0 +1,6 @@
---
fixes:
- Added a periodic task for postgresql datastore to clean up the archived WAL
files. Added a check when creating incremental backups for postgresql. A
new container image ``openstacktrove/db-backup-postgresql:1.1.2`` is
uploaded to docker hub.

View File

@ -1091,13 +1091,18 @@ postgresql_group = cfg.OptGroup(
'postgresql', title='PostgreSQL options',
help="Oslo option group for the PostgreSQL datastore.")
postgresql_opts = [
cfg.BoolOpt(
'enable_clean_wal_archives',
default=True,
help='Enable the periodic job to clean up WAL archive folder.'
),
cfg.StrOpt(
'docker_image', default='postgres',
help='Database docker image.'
),
cfg.StrOpt(
'backup_docker_image',
default='openstacktrove/db-backup-postgresql:1.1.1',
default='openstacktrove/db-backup-postgresql:1.1.2',
help='The docker image used for backup and restore.'
),
cfg.BoolOpt('icmp', default=False,
@ -1131,7 +1136,11 @@ postgresql_opts = [
cfg.StrOpt('wal_archive_location', default='/mnt/wal_archive',
help="Filesystem path storing WAL archive files when "
"WAL-shipping based backups or replication "
"is enabled."),
"is enabled.",
deprecated_for_removal=True,
deprecated_reason='Option is not used any more, will be '
'removed in future release.'
),
cfg.BoolOpt('root_on_create', default=False,
help='Enable the automatic creation of the root user for the '
'service during instance-create. The generated password for '
@ -1154,7 +1163,7 @@ postgresql_opts = [
"statement logging.",
deprecated_for_removal=True,
deprecated_reason='Will be replaced by configuration group '
'option: log_min_duration_statement'),
'option: log_min_duration_statement'),
cfg.IntOpt('default_password_length', default=36,
help='Character length of generated passwords.',
deprecated_name='default_password_length',

View File

@ -17,6 +17,7 @@ from functools import reduce
import inspect
import operator
import os
from pathlib import Path
import pwd
import re
import stat
@ -904,3 +905,19 @@ def remove_dir_contents(folder):
"""
path = os.path.join(folder, '*')
execute_shell_cmd(f'rm -rf {path}', [], shell=True, as_root=True)
def get_dir_size(path):
    """Return the total size, in bytes, of all regular files under *path*.

    The directory tree is walked recursively; directories themselves and
    non-regular entries contribute nothing to the total.
    """
    total = 0
    for entry in Path(path).rglob('*'):
        # Only count regular files; skip sub-directories, symlink targets
        # that are not files, etc.
        if entry.is_file():
            total += entry.stat().st_size
    return total
def get_filesystem_size(path):
    """Get size(bytes) of a mounted filesystem the given path locates.

    path is the pathname of any file within the mounted filesystem.
    """
    stats = os.statvfs(path)
    # Total blocks multiplied by the fundamental block size gives the
    # filesystem capacity in bytes.
    return stats.f_frsize * stats.f_blocks

View File

@ -64,7 +64,15 @@ class Manager(periodic_task.PeriodicTasks):
MODULE_APPLY_TO_ALL = module_manager.ModuleManager.MODULE_APPLY_TO_ALL
docker_client = docker.from_env()
_docker_client = None
@property
def docker_client(self):
    """Docker client, created lazily on first access and then cached.

    Lazy creation avoids contacting the docker daemon at import/definition
    time and lets tests inject a mock via ``_docker_client``.
    """
    if not self._docker_client:
        self._docker_client = docker.from_env()
    return self._docker_client
def __init__(self, manager_name):
super(Manager, self).__init__(CONF)

View File

@ -423,11 +423,19 @@ class BaseMySqlAdmin(object, metaclass=abc.ABCMeta):
class BaseMySqlApp(service.BaseDbApp):
configuration_manager = ConfigurationManager(
MYSQL_CONFIG, CONF.database_service_uid, CONF.database_service_uid,
service.BaseDbApp.CFG_CODEC, requires_root=True,
override_strategy=ImportOverrideStrategy(CNF_INCLUDE_DIR, CNF_EXT)
)
_configuration_manager = None
@property
def configuration_manager(self):
    """Lazily construct and cache the MySQL configuration manager.

    Building it on first access (instead of at class-definition time)
    avoids filesystem work during import and lets tests substitute
    ``_configuration_manager``.
    """
    if not self._configuration_manager:
        self._configuration_manager = ConfigurationManager(
            MYSQL_CONFIG, CONF.database_service_uid,
            CONF.database_service_uid, service.BaseDbApp.CFG_CODEC,
            requires_root=True,
            override_strategy=ImportOverrideStrategy(CNF_INCLUDE_DIR,
                                                     CNF_EXT)
        )
    return self._configuration_manager
def get_engine(self):
"""Create the default engine with the updated admin user.
@ -460,14 +468,12 @@ class BaseMySqlApp(service.BaseDbApp):
with mysql_util.SqlClient(self.get_engine()) as client:
return client.execute(sql_statement)
@classmethod
def get_data_dir(cls):
return cls.configuration_manager.get_value(
def get_data_dir(self):
return self.configuration_manager.get_value(
'datadir', section=MySQLConfParser.SERVER_CONF_SECTION)
@classmethod
def set_data_dir(cls, value):
cls.configuration_manager.apply_system_override(
def set_data_dir(self, value):
self.configuration_manager.apply_system_override(
{MySQLConfParser.SERVER_CONF_SECTION: {'datadir': value}})
def _create_admin_user(self, client, password):

View File

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from oslo_log import log as logging
from oslo_service import periodic_task
from trove.common import cfg
from trove.common import exception
@ -40,6 +42,82 @@ class PostgresManager(manager.Manager):
def configuration_manager(self):
return self.app.configuration_manager
def _check_wal_archive_size(self, archive_path, data_path):
    """Check wal archive folder size.

    Return True if the size is greater than half of the data volume size.

    :param archive_path: The WAL archive directory.
    :param data_path: Any path located on the data volume.
    """
    archive_size = operating_system.get_dir_size(archive_path)
    data_volume_size = operating_system.get_filesystem_size(data_path)

    # Below the threshold: nothing to report.
    if archive_size <= data_volume_size / 2:
        return False

    LOG.info(f"The size({archive_size}) of wal archive folder is "
             f"greater than half of the data volume "
             f"size({data_volume_size})")
    return True
def _remove_older_files(self, archive_path, files, cur_file):
    """Remove files older than cur_file.

    :param archive_path: The archive folder
    :param files: List of the ordered file names.
    :param cur_file: The compared file name.
    """
    # WAL segment names are fixed-width hex, so a lexicographic compare
    # of the 24-char sequence prefix orders them chronologically.
    cur_seq = os.path.basename(cur_file).split('.')[0]
    wal_re = re.compile(r"^([0-9A-F]{24}).*")

    for wal_file in files:
        match = wal_re.search(wal_file)
        if not match or match.group(1) >= cur_seq:
            continue
        file_path = os.path.join(archive_path, wal_file)
        LOG.info(f"Removing wal file {file_path}")
        operating_system.remove(
            path=file_path, force=True, recursive=False, as_root=True)
def _remove_wals(self, archive_path, force=False):
    """Remove wal files.

    If force=True, do not consider backup.

    :param archive_path: The archive folder.
    :param force: When True, ignore ``.backup`` markers and keep only
        the latest WAL segment.
    """
    files = os.listdir(archive_path)
    files = sorted(files, reverse=True)
    # Nothing to clean in an empty archive; also avoids indexing an
    # empty list below.
    if not files:
        return

    wal_files = []
    if not force:
        # Get latest backup file. Dots are escaped so the pattern only
        # matches real basebackup markers such as
        # 000000010000000000000006.00000168.backup — consistent with
        # PgBasebackup.get_backup_file().
        backup_re = re.compile(r"[0-9A-F]{24}\..*\.backup")
        wal_files = [wal_file for wal_file in files
                     if backup_re.search(wal_file)]

    # If there is no backup file or force=True, remove all except the
    # latest one, otherwise, remove all the files older than the backup
    # file
    wal_files = wal_files or files
    self._remove_older_files(archive_path, files, wal_files[0])
def _clean_wals(self, archive_path, data_path, force=False):
    """Shrink the WAL archive folder when it grows too large.

    First remove WAL files older than the latest backup file; if the
    archive still exceeds half of the data volume size, retry once with
    force=True, which keeps only the latest WAL segment.

    :param archive_path: The archive folder.
    :param data_path: Any path on the data volume.
    :param force: Ignore backup files when removing WALs.
    """
    if self._check_wal_archive_size(archive_path, data_path):
        self._remove_wals(archive_path, force)
        if not force:
            # Check again with force=True. Guarding on `force` prevents
            # unbounded recursion when even the forced cleanup cannot
            # bring the size below the threshold (e.g. a single huge
            # WAL segment remains).
            self._clean_wals(archive_path, data_path, force=True)
@periodic_task.periodic_task(
    enabled=CONF.postgresql.enable_clean_wal_archives,
    spacing=180)
def clean_wal_archives(self, context):
    """Clean up the wal archives to free up disk space."""
    archive_path = service.WAL_ARCHIVE_DIR
    data_path = cfg.get_configuration_property('mount_point')

    # Nothing to do until archiving has actually created the folder.
    archives_exist = operating_system.exists(
        archive_path, is_directory=True, as_root=True)
    if archives_exist:
        self._clean_wals(archive_path, data_path)
def do_prepare(self, context, packages, databases, memory_mb, users,
device_path, mount_point, backup_info,
config_contents, root_password, overrides,

View File

@ -74,18 +74,26 @@ class PgSqlAppStatus(service.BaseDbStatus):
class PgSqlApp(service.BaseDbApp):
configuration_manager = configuration.ConfigurationManager(
CONFIG_FILE,
CONF.database_service_uid,
CONF.database_service_uid,
stream_codecs.KeyValueCodec(
value_quoting=True,
bool_case=stream_codecs.KeyValueCodec.BOOL_LOWER,
big_ints=True),
requires_root=True,
override_strategy=configuration.ImportOverrideStrategy(
CNF_INCLUDE_DIR, CNF_EXT)
)
_configuration_manager = None
@property
def configuration_manager(self):
    """Lazily construct and cache the PostgreSQL configuration manager.

    Deferring construction to first access avoids filesystem work at
    class-definition time and lets tests substitute
    ``_configuration_manager``.
    """
    if not self._configuration_manager:
        codec = stream_codecs.KeyValueCodec(
            value_quoting=True,
            bool_case=stream_codecs.KeyValueCodec.BOOL_LOWER,
            big_ints=True)
        strategy = configuration.ImportOverrideStrategy(
            CNF_INCLUDE_DIR, CNF_EXT)
        self._configuration_manager = configuration.ConfigurationManager(
            CONFIG_FILE,
            CONF.database_service_uid,
            CONF.database_service_uid,
            codec,
            requires_root=True,
            override_strategy=strategy)
    return self._configuration_manager
def __init__(self, status, docker_client):
super(PgSqlApp, self).__init__(status, docker_client)
@ -96,13 +104,11 @@ class PgSqlApp(service.BaseDbApp):
self.datadir = f"{mount_point}/data/pgdata"
self.adm = PgSqlAdmin(SUPER_USER_NAME)
@classmethod
def get_data_dir(cls):
return cls.configuration_manager.get_value('data_directory')
def get_data_dir(self):
return self.configuration_manager.get_value('data_directory')
@classmethod
def set_data_dir(cls, value):
cls.configuration_manager.apply_system_override(
def set_data_dir(self, value):
self.configuration_manager.apply_system_override(
{'data_directory': value})
def reload(self):

View File

@ -509,7 +509,16 @@ class BaseDbApp(object):
'success': False,
'state': BackupState.FAILED,
})
raise Exception(msg)
# The exception message is visible to the user
user_msg = msg
ex_regex = re.compile(r'.+Exception: (.+)')
for line in output[-5:-1]:
m = ex_regex.search(line)
if m:
user_msg = m.group(1)
break
raise Exception(user_msg)
except Exception as err:
LOG.error("Failed to create backup %s", backup_id)
backup_state.update({

View File

@ -0,0 +1,101 @@
# Copyright 2021 Catalyst Cloud
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from unittest import mock
from trove.guestagent.datastore.postgres import manager
from trove.guestagent.datastore.postgres import service
from trove.tests.unittests import trove_testtools
class TestPostgresManager(trove_testtools.TestCase):
    """Unit tests for the WAL-archive cleanup periodic task."""

    def setUp(self):
        super(TestPostgresManager, self).setUp()

        # Stub the cached docker client so constructing the manager never
        # talks to a real docker daemon.
        manager.PostgresManager._docker_client = mock.MagicMock()

        self.patch_datastore_manager('postgresql')

    @mock.patch('trove.guestagent.common.operating_system.remove')
    @mock.patch('os.listdir')
    @mock.patch('trove.guestagent.common.operating_system.get_filesystem_size')
    @mock.patch('trove.guestagent.common.operating_system.get_dir_size')
    @mock.patch('trove.guestagent.common.operating_system.exists')
    def test_clean_wal_archives(self, mock_exists, mock_get_dir_size,
                                mock_get_filesystem_size, mock_listdir,
                                mock_remove):
        """A .backup file exists: only WALs older than it are removed."""
        mock_exists.return_value = True
        # First size check: 6 > 10/2 triggers cleanup; second (forced)
        # check: 1 <= 10/2 stops the recursion.
        mock_get_dir_size.side_effect = [6, 1]
        mock_get_filesystem_size.return_value = 10
        mock_listdir.return_value = [
            '0000000100000002000000D4',
            '00000001000000000000008D',
            '0000000100000000000000A7.00000028.backup',
            '0000000100000000000000A7',
            '0000000100000002000000E7'
        ]

        psql_manager = manager.PostgresManager()
        psql_manager.clean_wal_archives(mock.ANY)

        # Only ...8D sorts strictly below the backup's segment (...A7);
        # the WAL matching the backup segment itself is kept.
        self.assertEqual(1, mock_remove.call_count)

        archive_path = service.WAL_ARCHIVE_DIR
        expected_calls = [
            mock.call(
                path=os.path.join(archive_path, '00000001000000000000008D'),
                force=True, recursive=False,
                as_root=True),
        ]
        self.assertEqual(expected_calls, mock_remove.call_args_list)

    @mock.patch('trove.guestagent.common.operating_system.remove')
    @mock.patch('os.listdir')
    @mock.patch('trove.guestagent.common.operating_system.get_filesystem_size')
    @mock.patch('trove.guestagent.common.operating_system.get_dir_size')
    @mock.patch('trove.guestagent.common.operating_system.exists')
    def test_clean_wal_archives_no_backups(self, mock_exists,
                                           mock_get_dir_size,
                                           mock_get_filesystem_size,
                                           mock_listdir,
                                           mock_remove):
        """No .backup file: everything except the newest WAL is removed."""
        mock_exists.return_value = True
        # First size check over the threshold, second (forced) check
        # under it.
        mock_get_dir_size.side_effect = [6, 1]
        mock_get_filesystem_size.return_value = 10
        mock_listdir.return_value = [
            '0000000100000002000000D4',
            '00000001000000000000008D',
            '0000000100000000000000A7',
            '0000000100000002000000E7'
        ]

        psql_manager = manager.PostgresManager()
        psql_manager.clean_wal_archives(mock.ANY)

        # ...E7 is the newest segment and is kept; the other three are
        # removed in reverse-sorted (newest-first) order.
        self.assertEqual(3, mock_remove.call_count)

        archive_path = service.WAL_ARCHIVE_DIR
        expected_calls = [
            mock.call(
                path=os.path.join(archive_path, '0000000100000002000000D4'),
                force=True, recursive=False,
                as_root=True),
            mock.call(
                path=os.path.join(archive_path, '0000000100000000000000A7'),
                force=True, recursive=False,
                as_root=True),
            mock.call(
                path=os.path.join(archive_path, '00000001000000000000008D'),
                force=True, recursive=False,
                as_root=True),
        ]
        self.assertEqual(expected_calls, mock_remove.call_args_list)