
Fix ironic rebalance race rocky #4

Merged 8 commits on Nov 21, 2019
36 changes: 36 additions & 0 deletions .travis.yml
@@ -0,0 +1,36 @@
---
language: python
python: "2.7"

# Run jobs in VMs - sudo is required by ansible tests.
sudo: required

# Install ansible
addons:
apt:
packages:
- gcc
- python-apt
- python-virtualenv
- realpath

# Create a build matrix for the different test jobs.
env:
matrix:
# Run python style checks.
- TOX_ENV=pep8
# Build documentation.
- TOX_ENV=docs
# Run python2.7 unit tests.
- TOX_ENV=py27
# Run functional tests.
- TOX_ENV=functional

install:
# Install tox in a virtualenv to ensure we have an up to date version.
- pip install -U pip
- pip install tox

script:
# Run the tox environment.
- tox -e ${TOX_ENV}
@@ -1,7 +1,7 @@
{
"hypervisors": [
{
"hypervisor_hostname": "fake-mini",
"hypervisor_hostname": "host2",
"id": "1bb62a04-c576-402c-8147-9e89757a09e3",
"state": "up",
"status": "enabled"
33 changes: 25 additions & 8 deletions nova/compute/manager.py
@@ -8033,13 +8033,16 @@ def update_available_resource(self, context, startup=False):
compute_nodes_in_db = self._get_compute_nodes_in_db(context,
use_slave=True,
startup=startup)

rt = self._get_resource_tracker()
rt.clean_compute_node_cache(compute_nodes_in_db)

try:
nodenames = set(self.driver.get_available_nodes())
except exception.VirtDriverNotReady:
LOG.warning("Virt driver is not ready.")
return

rt = self._get_resource_tracker()
# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
@@ -8048,13 +8051,27 @@ def update_available_resource(self, context, startup=False):
"nodes are %(nodes)s",
{'id': cn.id, 'hh': cn.hypervisor_hostname,
'nodes': nodenames})
cn.destroy()
rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations and inventory.
# TODO(cdent): Move use of reportclient into resource tracker.
self.scheduler_client.reportclient.delete_resource_provider(
context, cn, cascade=True)
try:
cn.destroy()
except exception.ComputeHostNotFound:
# NOTE(mgoddard): it's possible that another compute
# service took ownership of this compute node since we
# queried it due to a rebalance, and this will cause the
# deletion to fail. Ignore the error in that case.
LOG.info("Ignoring failure to delete orphan compute node "
"%(id)s on hypervisor host %(hh)s due to "
"possible node rebalance",
{'id': cn.id, 'hh': cn.hypervisor_hostname})
rt.remove_node(cn.hypervisor_hostname)
else:
rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations.
# TODO(cdent): Move use of reportclient into resource
# tracker.
reportclient = self.scheduler_client.reportclient
reportclient.delete_resource_provider(
context, cn, cascade=True)

for nodename in nodenames:
self._update_available_resource_for_node(context, nodename)
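The hunk above carries the core of the fix: if another compute service has claimed the node between the DB query and `cn.destroy()` (an ironic rebalance), `destroy()` raises `ComputeHostNotFound` and the resource provider in placement is deliberately left alone. A standalone sketch of that control flow, with invented helper names and no Nova imports:

```python
# Standalone sketch (not Nova code) of the orphan-deletion pattern above.
# All names are illustrative.
class ComputeHostNotFound(Exception):
    """Stand-in for nova.exception.ComputeHostNotFound."""


def delete_orphan_nodes(compute_nodes_in_db, driver_nodenames, tracker,
                        reportclient, context):
    for cn in compute_nodes_in_db:
        if cn.hypervisor_hostname in driver_nodenames:
            continue  # still reported by the driver, not an orphan
        try:
            cn.destroy()
        except ComputeHostNotFound:
            # Another compute service took ownership of this node since we
            # queried the DB (a rebalance): drop it from our local cache,
            # but do NOT delete the resource provider the new owner uses.
            tracker.remove_node(cn.hypervisor_hostname)
        else:
            tracker.remove_node(cn.hypervisor_hostname)
            reportclient.delete_resource_provider(context, cn, cascade=True)
```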
17 changes: 17 additions & 0 deletions nova/compute/resource_tracker.py
@@ -1547,3 +1547,20 @@ def build_failed(self, nodename):
def build_succeeded(self, nodename):
"""Resets the failed_builds stats for the given node."""
self.stats[nodename].build_succeeded()

@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
def clean_compute_node_cache(self, compute_nodes_in_db):
"""Clean the compute node cache of any nodes that no longer exist.

:param compute_nodes_in_db: list of ComputeNode objects from the DB.
"""
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
for cn in compute_nodes_in_db}
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames

for stale_cn in stale_cns:
# NOTE(mgoddard): we have found a node in the cache that has no
# compute node in the DB. This could be due to a node rebalance
# where another compute service took ownership of the node. Clean
# up the cache.
self.remove_node(stale_cn)
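
For illustration, the cache-cleaning rule reduced to plain sets and dicts; this is a simplified sketch with made-up names, not the tracker code itself:

```python
# Simplified, standalone sketch (not Nova code): any nodename cached by the
# tracker but absent from the DB list is dropped.
class FakeComputeNode(object):
    def __init__(self, hypervisor_hostname):
        self.hypervisor_hostname = hypervisor_hostname


def clean_cache(cached_compute_nodes, compute_nodes_in_db):
    db_nodenames = {cn.hypervisor_hostname for cn in compute_nodes_in_db}
    for stale in set(cached_compute_nodes) - db_nodenames:
        # e.g. the node was rebalanced and its DB record is now owned by
        # another compute service.
        del cached_compute_nodes[stale]


cache = {'node1': object(), 'node2': object()}
clean_cache(cache, [FakeComputeNode('node1')])
assert list(cache) == ['node1']
```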
5 changes: 3 additions & 2 deletions nova/db/api.py
@@ -330,15 +330,16 @@ def compute_node_update(context, compute_id, values):
return IMPL.compute_node_update(context, compute_id, values)


def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a compute node from the database.

:param context: The security context
:param compute_id: ID of the compute node
:param compute_host: Hostname of the compute service

Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
"""
return IMPL.compute_node_delete(context, compute_id)
return IMPL.compute_node_delete(context, compute_id, compute_host)


def compute_node_statistics(context):
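A hedged usage sketch of the new signature (the context, id and host values are invented): scoping the delete to the owning host means a racing delete surfaces as `ComputeHostNotFound` instead of removing a record that now belongs to another service.

```python
from nova import context as nova_context
from nova.db import api as db
from nova import exception

ctxt = nova_context.get_admin_context()
try:
    # Only deletes the row if it is still owned by 'host1'.
    db.compute_node_delete(ctxt, compute_id=42, compute_host='host1')
except exception.ComputeHostNotFound:
    # Either no such id, or the node has since been claimed by another
    # nova-compute (ironic rebalance); treat it as already gone.
    pass
```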
21 changes: 17 additions & 4 deletions nova/db/sqlalchemy/api.py
@@ -685,16 +685,29 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):


@pick_context_manager_writer
def compute_node_create(context, values):
def _compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data.
"""
convert_objects_related_datetimes(values)

compute_node_ref = models.ComputeNode()
compute_node_ref.update(values)
compute_node_ref.save(context.session)
return compute_node_ref


# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
# so that we get a separate transaction in the exception handler. This avoids
# an error message about inactive DB sessions during a transaction rollback.
# See https://bugs.launchpad.net/nova/+bug/1853159.
def compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data. Will restore a soft deleted compute node if a
UUID has been explicitly requested.
"""
try:
compute_node_ref.save(context.session)
compute_node_ref = _compute_node_create(context, values)
except db_exc.DBDuplicateEntry:
with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
# Check to see if we have a (soft) deleted ComputeNode with the
@@ -758,10 +771,10 @@ def compute_node_update(context, compute_id, values):


@pick_context_manager_writer
def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a ComputeNode record."""
result = model_query(context, models.ComputeNode).\
filter_by(id=compute_id).\
filter_by(id=compute_id, host=compute_host).\
soft_delete(synchronize_session=False)

if not result:
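The transaction-splitting idea in the NOTE above is easier to see in isolation. A generic, self-contained sketch with a toy transaction context manager standing in for the enginefacade writer decorator (all names are illustrative, not Nova code):

```python
import contextlib


@contextlib.contextmanager
def transaction():
    # Toy stand-in for @pick_context_manager_writer / enginefacade.
    try:
        yield
    except Exception:
        print("rollback")
        raise
    else:
        print("commit")


class DBDuplicateEntry(Exception):
    pass


def _create(values):
    # The inner helper owns its own transaction, like _compute_node_create.
    with transaction():
        if values.get("duplicate"):
            raise DBDuplicateEntry()
        return dict(values)


def create(values):
    # Deliberately *not* wrapped in transaction(): by the time the handler
    # below runs, the inner transaction has already rolled back, so starting
    # a fresh one (e.g. to undelete a soft-deleted row) is safe.
    try:
        return _create(values)
    except DBDuplicateEntry:
        with transaction():
            return {"restored": True}


print(create({"name": "node1"}))    # commit, then {'name': 'node1'}
print(create({"duplicate": True}))  # rollback, then commit, {'restored': True}
```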
2 changes: 1 addition & 1 deletion nova/objects/compute_node.py
@@ -337,7 +337,7 @@ def save(self, prune_stats=False):

@base.remotable
def destroy(self):
db.compute_node_delete(self._context, self.id)
db.compute_node_delete(self._context, self.id, self.host)

def update_from_virt_driver(self, resources):
# NOTE(pmurray): the virt driver provides a dict of values that
66 changes: 51 additions & 15 deletions nova/test.py
@@ -419,30 +419,42 @@ def flags(self, **kw):

def start_service(self, name, host=None, **kwargs):
cell = None
# if the host is None then the CONF.host remains defaulted to
# 'fake-mini' (originally done in ConfFixture)
if host is not None:
# Make sure that CONF.host is relevant to the right hostname
self.useFixture(nova_fixtures.ConfPatcher(host=host))

if name == 'compute' and self.USES_DB:
# NOTE(danms): We need to create the HostMapping first, because
# otherwise we'll fail to update the scheduler while running
# the compute node startup routines below.
ctxt = context.get_context()
cell_name = kwargs.pop('cell', CELL1_NAME) or CELL1_NAME
cell = self.cell_mappings[cell_name]
hm = objects.HostMapping(context=ctxt,
host=host or name,
cell_mapping=cell)
hm.create()
self.host_mappings[hm.host] = hm
if host is not None:
# Make sure that CONF.host is relevant to the right hostname
self.useFixture(nova_fixtures.ConfPatcher(host=host))
if (host or name) not in self.host_mappings:
# NOTE(gibi): If the HostMapping does not exists then this is
# the first start of the service so we create the mapping.
hm = objects.HostMapping(context=ctxt,
host=host or name,
cell_mapping=cell)
hm.create()
self.host_mappings[hm.host] = hm
svc = self.useFixture(
nova_fixtures.ServiceFixture(name, host, cell=cell, **kwargs))

return svc.service

def restart_compute_service(self, compute):
"""Restart a compute service in a realistic way.
def restart_compute_service(self, compute, keep_hypervisor_state=True):
"""Stops the service and starts a new one to have realistic restart

:param:compute: the nova-compute service to be restarted
:param:keep_hypervisor_state: If true then already defined instances
will survive the compute service restart.
If false then the new service will see
an empty hypervisor
:returns: a new compute service instance serving the same host and
          node
"""

# NOTE(gibi): The service interface cannot be used to simulate a real
@@ -452,12 +464,36 @@ def restart_compute_service(self, compute):
# a stop start. The service.kill() call cannot help as it deletes
# the service from the DB which is unrealistic and causes that some
# operation that refers to the killed host (e.g. evacuate) fails.
# So this helper method tries to simulate a better compute service
# restart by cleaning up some of the internal state of the compute
# manager.
# So this helper method will stop the original service and then starts
# a brand new compute service for the same host and node. This way
# a new ComputeManager instance will be created and initialized during
# the service startup.
compute.stop()
compute.manager._resource_tracker = None
compute.start()

# this service was running previously so we have to make sure that
# we restart it in the same cell
cell_name = self.host_mappings[compute.host].cell_mapping.name

if keep_hypervisor_state:
# NOTE(gibi): FakeDriver does not provide a meaningful way to
# define some servers that exists already on the hypervisor when
# the driver is (re)created during the service startup. This means
# that we cannot simulate that the definition of a server
# survives a nova-compute service restart on the hypervisor.
# Instead here we save the FakeDriver instance that knows about
# the defined servers and inject that driver into the new Manager
# class during the startup of the compute service.
old_driver = compute.manager.driver
with mock.patch(
'nova.virt.driver.load_compute_driver') as load_driver:
load_driver.return_value = old_driver
new_compute = self.start_service(
'compute', host=compute.host, cell=cell_name)
else:
new_compute = self.start_service(
'compute', host=compute.host, cell=cell_name)

return new_compute

def assertJsonEqual(self, expected, observed, message=''):
"""Asserts that 2 complex data structures are json equivalent.
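A hypothetical functional-test sketch (test class and host names invented, not part of this change) showing how the reworked helper is intended to be used; keep_hypervisor_state=False simulates a hypervisor that comes back empty after the restart:

```python
from nova.tests.functional import integrated_helpers


class RestartSketchTest(integrated_helpers._IntegratedTestBase):
    """Illustrative sketch only."""

    def test_restart_compute(self):
        compute2 = self.start_service('compute', host='host2')

        # Default: the FakeDriver instance (and the servers it has defined)
        # is carried over to the new service, like a real process restart.
        compute2 = self.restart_compute_service(compute2)

        # Simulate a restart where the hypervisor has lost its instances.
        compute2 = self.restart_compute_service(
            compute2, keep_hypervisor_state=False)
```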
@@ -1,7 +1,7 @@
{
"hypervisors": [
{
"hypervisor_hostname": "fake-mini",
"hypervisor_hostname": "host2",
"id": "%(hypervisor_id)s",
"state": "up",
"status": "enabled"
5 changes: 0 additions & 5 deletions nova/tests/functional/api_sample_tests/test_hypervisors.py
@@ -18,7 +18,6 @@
from nova.cells import utils as cells_utils
from nova import objects
from nova.tests.functional.api_sample_tests import api_sample_base
from nova.virt import fake


class HypervisorsSampleJsonTests(api_sample_base.ApiSampleTestBaseV21):
@@ -157,8 +156,6 @@ def setUp(self):
# Start a new compute service to fake a record with hypervisor id=2
# for pagination test.
host = 'host1'
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
self.start_service('compute', host=host)

def test_hypervisors_list(self):
Expand Down Expand Up @@ -205,8 +202,6 @@ def test_hypervisors_list(self):
def test_hypervisors_detail(self):
# Start another compute service to get a 2nd compute for paging tests.
host = 'host2'
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
service_2 = self.start_service('compute', host=host).service_ref
compute_node_2 = service_2.compute_node
marker = self.compute_node_1.uuid
3 changes: 0 additions & 3 deletions nova/tests/functional/compute/test_live_migration.py
@@ -20,7 +20,6 @@
from nova.tests.functional import integrated_helpers
from nova.tests.unit import fake_notifier
from nova.tests import uuidsentinel as uuids
from nova.virt import fake


class FakeCinderError(object):
@@ -53,8 +52,6 @@ def setUp(self):
# Start a second compute node (the first one was started for us by
# _IntegratedTestBase. set_nodes() is needed to avoid duplicate
# nodenames. See comments in test_bug_1702454.py.
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.compute2 = self.start_service('compute', host='host2')

# To get the old Cinder flow we need to hack the service version, otherwise
3 changes: 0 additions & 3 deletions nova/tests/functional/integrated_helpers.py
@@ -35,7 +35,6 @@
import nova.tests.unit.image.fake
from nova.tests.unit import policy_fixture
from nova.tests import uuidsentinel as uuids
from nova.virt import fake


CONF = nova.conf.CONF
@@ -396,8 +395,6 @@ def _start_compute(self, host, cell_name=None):
compute service (defaults to cell1)
:return: the nova compute service object
"""
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
compute = self.start_service('compute', host=host, cell=cell_name)
self.computes[host] = compute
return compute
@@ -22,7 +22,6 @@
from nova.tests.functional.notification_sample_tests \
import notification_sample_base
from nova.tests.unit import fake_notifier
from nova.virt import fake

COMPUTE_VERSION_OLD_ATTACH_FLOW = \
compute_api.CINDER_V3_ATTACH_MIN_COMPUTE_VERSION - 1
@@ -48,8 +47,6 @@ def test_multiple_compute_actions(self):
self._wait_for_notification('instance.create.end')
self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
# server will boot on the 'compute' host
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.compute2 = self.start_service('compute', host='host2')

actions = [
3 changes: 0 additions & 3 deletions nova/tests/functional/regressions/test_bug_1669054.py
@@ -13,7 +13,6 @@
from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers
from nova.virt import fake


class ResizeEvacuateTestCase(integrated_helpers._IntegratedTestBase,
@@ -47,8 +46,6 @@ def test_resize_then_evacuate(self):
self._wait_for_state_change(self.api, server, 'ACTIVE')

# Start up another compute service so we can resize.
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
host2 = self.start_service('compute', host='host2')

# Now resize the server to move it to host2.