commit 5cb20a610ff2b76f5f196dd93928a9102a1055a1 Author: Dan Voiculeasa Date: Fri Sep 18 12:59:22 2020 +0300 Change restore procedure During the restore playbook, before controller-0 unlock, there will be a flag file created to indicate that the system is going through a system restore. Exiting the system restore state is done through a command after all nodes are up and unlocked. Next commit will introduce sysinv commands to query and control the system restore state. Until all nodes are up, pods are stuck in `Terminating`. Armada will time out waiting for those pods if an armada apply is requested. This commit ensures auto-apply of apps does not occur during the system restore. While in the restore state, allow apps to have their images downloaded. If an image download fails, revert the status of the app to APP_RESTORE_REQUESTED instead of APP_APPLY_FAILURE. The auto image download is retried for apps in APP_RESTORE_REQUESTED until the system restore state is exited. This commit ensures enough time for manual intervention to fix the networking, docker registries connectivity, or any other issues related to container images. Note: In the case of multi-node setups, helm overrides may have been detected, so apps will be auto-applied after exiting the restore state. The auto apply is started by a periodic audit thread. 
Change-Id: I44fc4aaa528e372a84115714f271b4f5e063f86e Partial-Bug: 1887648 Signed-off-by: Dan Voiculeasa diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index 9d70729..765f1f4 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -17,6 +17,8 @@ SYSINV_CONFIG_FILE_LOCAL = '/etc/sysinv/sysinv.conf' SYSINV_CONF_DEFAULT_FILE = 'sysinv.conf.default' SYSINV_CONF_DEFAULT_PATH = os.path.join(SYSINV_CONFIG_PATH, SYSINV_CONF_DEFAULT_FILE) +SYSINV_RESTORE_FLAG = os.path.join(SYSINV_CONFIG_PATH, + ".restore_in_progress") HTTPS_CONFIG_REQUIRED = os.path.join(tsc.CONFIG_PATH, '.https_config_required') ADMIN_ENDPOINT_CONFIG_REQUIRED = os.path.join(tsc.CONFIG_PATH, '.admin_endpoint_config_required') diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 7015fa9..302fb3b 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -203,9 +203,6 @@ class ConductorManager(service.PeriodicService): # this will track the config w/ reboot request to apply self._host_reboot_config_uuid = {} - # Guard for a run once function - self._requested_restore = False - def start(self): self._start() # accept API calls and run periodic tasks after @@ -5391,7 +5388,6 @@ class ConductorManager(service.PeriodicService): greenthread.spawn(self._restore_download_images, app) - self._requested_restore = True except Exception as e: LOG.info("Helper Task: _k8s_application_images_audit: Will retry") LOG.exception(e) @@ -5408,7 +5404,7 @@ class ConductorManager(service.PeriodicService): app.save() except Exception as e: LOG.exception(e) - app.status = constants.APP_APPLY_FAILURE + app.status = constants.APP_RESTORE_REQUESTED app.progress = constants.APP_PROGRESS_IMAGES_DOWNLOAD_FAILED app.save() @@ -5433,6 +5429,10 @@ class 
ConductorManager(service.PeriodicService): return False + @staticmethod + def _verify_restore_in_progress(): + return os.path.isfile(constants.SYSINV_RESTORE_FLAG) + @periodic_task.periodic_task(spacing=CONF.conductor.audit_interval, run_immediately=True) def _k8s_application_audit(self, context): @@ -5457,8 +5457,11 @@ class ConductorManager(service.PeriodicService): LOG.debug("Software update orchestration in progress. Defer audit.") return - if not self._requested_restore: + if self._verify_restore_in_progress(): self._k8s_application_images_audit(context) + LOG.info("Restore in progress - defer platform managed application " + "activity") + return # Ensure that armada pod is running. pods = self._kube.kube_get_pods_by_selector("armada", @@ -5515,6 +5518,10 @@ class ConductorManager(service.PeriodicService): LOG.debug("Periodic Task: _k8s_application_audit: Finished") def check_pending_app_reapply(self, context): + if self._verify_restore_in_progress(): + LOG.info("Restore in progress - Ignore app reapply checks.") + return + # Defer application reapply while an upgrade is active try: self.verify_upgrade_not_in_progress()