
Fix ironic rebalance race rocky #4

Merged 8 commits on Nov 21, 2019
36 changes: 36 additions & 0 deletions .travis.yml
@@ -0,0 +1,36 @@
---
language: python
python: "2.7"

# Run jobs in VMs - sudo is required by ansible tests.
sudo: required

# Install ansible
addons:
apt:
packages:
- gcc
- python-apt
- python-virtualenv
- realpath

# Create a build matrix for the different test jobs.
env:
matrix:
# Run python style checks.
- TOX_ENV=pep8
# Build documentation.
- TOX_ENV=docs
# Run python2.7 unit tests.
- TOX_ENV=py27
# Run functional tests.
- TOX_ENV=functional

install:
# Install tox in a virtualenv to ensure we have an up to date version.
- pip install -U pip
- pip install tox

script:
# Run the tox environment.
- tox -e ${TOX_ENV}
@@ -1,7 +1,7 @@
{
"hypervisors": [
{
"hypervisor_hostname": "fake-mini",
"hypervisor_hostname": "host2",
"id": "1bb62a04-c576-402c-8147-9e89757a09e3",
"state": "up",
"status": "enabled"
33 changes: 25 additions & 8 deletions nova/compute/manager.py
@@ -8033,13 +8033,16 @@ def update_available_resource(self, context, startup=False):
compute_nodes_in_db = self._get_compute_nodes_in_db(context,
use_slave=True,
startup=startup)

rt = self._get_resource_tracker()
rt.clean_compute_node_cache(compute_nodes_in_db)

try:
nodenames = set(self.driver.get_available_nodes())
except exception.VirtDriverNotReady:
LOG.warning("Virt driver is not ready.")
return

rt = self._get_resource_tracker()
# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
@@ -8048,13 +8051,27 @@ def update_available_resource(self, context, startup=False):
"nodes are %(nodes)s",
{'id': cn.id, 'hh': cn.hypervisor_hostname,
'nodes': nodenames})
cn.destroy()
rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations and inventory.
# TODO(cdent): Move use of reportclient into resource tracker.
self.scheduler_client.reportclient.delete_resource_provider(
context, cn, cascade=True)
try:
cn.destroy()
except exception.ComputeHostNotFound:
# NOTE(mgoddard): it's possible that another compute
# service took ownership of this compute node since we
# queried it due to a rebalance, and this will cause the
# deletion to fail. Ignore the error in that case.
LOG.info("Ignoring failure to delete orphan compute node "
"%(id)s on hypervisor host %(hh)s due to "
"possible node rebalance",
{'id': cn.id, 'hh': cn.hypervisor_hostname})
rt.remove_node(cn.hypervisor_hostname)
else:
rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations.
# TODO(cdent): Move use of reportclient into resource
# tracker.
reportclient = self.scheduler_client.reportclient
reportclient.delete_resource_provider(
context, cn, cascade=True)

for nodename in nodenames:
self._update_available_resource_for_node(context, nodename)
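The hunk above carries the core of the fix: if another compute service has claimed the node between the DB query and `cn.destroy()` (an ironic rebalance), `destroy()` raises `ComputeHostNotFound` and the resource provider in placement is deliberately left alone. A standalone sketch of that control flow, with invented helper names and no Nova imports:

```python
# Standalone sketch (not Nova code) of the orphan-deletion pattern above.
# All names are illustrative.
class ComputeHostNotFound(Exception):
    """Stand-in for nova.exception.ComputeHostNotFound."""


def delete_orphan_nodes(compute_nodes_in_db, driver_nodenames, tracker,
                        reportclient, context):
    for cn in compute_nodes_in_db:
        if cn.hypervisor_hostname in driver_nodenames:
            continue  # still reported by the driver, not an orphan
        try:
            cn.destroy()
        except ComputeHostNotFound:
            # Another compute service took ownership of this node since we
            # queried the DB (a rebalance): drop it from our local cache,
            # but do NOT delete the resource provider the new owner uses.
            tracker.remove_node(cn.hypervisor_hostname)
        else:
            tracker.remove_node(cn.hypervisor_hostname)
            reportclient.delete_resource_provider(context, cn, cascade=True)
```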
17 changes: 17 additions & 0 deletions nova/compute/resource_tracker.py
@@ -1547,3 +1547,20 @@ def build_failed(self, nodename):
def build_succeeded(self, nodename):
"""Resets the failed_builds stats for the given node."""
self.stats[nodename].build_succeeded()

@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE)
def clean_compute_node_cache(self, compute_nodes_in_db):
"""Clean the compute node cache of any nodes that no longer exist.

:param compute_nodes_in_db: list of ComputeNode objects from the DB.
"""
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
for cn in compute_nodes_in_db}
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames

for stale_cn in stale_cns:
# NOTE(mgoddard): we have found a node in the cache that has no
# compute node in the DB. This could be due to a node rebalance
# where another compute service took ownership of the node. Clean
# up the cache.
self.remove_node(stale_cn)
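
For illustration, the cache-cleaning rule reduced to plain sets and dicts; this is a simplified sketch with made-up names, not the tracker code itself:

```python
# Simplified, standalone sketch (not Nova code): any nodename cached by the
# tracker but absent from the DB list is dropped.
class FakeComputeNode(object):
    def __init__(self, hypervisor_hostname):
        self.hypervisor_hostname = hypervisor_hostname


def clean_cache(cached_compute_nodes, compute_nodes_in_db):
    db_nodenames = {cn.hypervisor_hostname for cn in compute_nodes_in_db}
    for stale in set(cached_compute_nodes) - db_nodenames:
        # e.g. the node was rebalanced and its DB record is now owned by
        # another compute service.
        del cached_compute_nodes[stale]


cache = {'node1': object(), 'node2': object()}
clean_cache(cache, [FakeComputeNode('node1')])
assert list(cache) == ['node1']
```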
5 changes: 3 additions & 2 deletions nova/db/api.py
@@ -330,15 +330,16 @@ def compute_node_update(context, compute_id, values):
return IMPL.compute_node_update(context, compute_id, values)


def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a compute node from the database.

:param context: The security context
:param compute_id: ID of the compute node
:param compute_host: Hostname of the compute service

Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
"""
return IMPL.compute_node_delete(context, compute_id)
return IMPL.compute_node_delete(context, compute_id, compute_host)


def compute_node_statistics(context):
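A hedged usage sketch of the new signature (the context, id and host values are invented): scoping the delete to the owning host means a racing delete surfaces as `ComputeHostNotFound` instead of removing a record that now belongs to another service.

```python
from nova import context as nova_context
from nova.db import api as db
from nova import exception

ctxt = nova_context.get_admin_context()
try:
    # Only deletes the row if it is still owned by 'host1'.
    db.compute_node_delete(ctxt, compute_id=42, compute_host='host1')
except exception.ComputeHostNotFound:
    # Either no such id, or the node has since been claimed by another
    # nova-compute (ironic rebalance); treat it as already gone.
    pass
```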
21 changes: 17 additions & 4 deletions nova/db/sqlalchemy/api.py
@@ -685,16 +685,29 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):


@pick_context_manager_writer
def compute_node_create(context, values):
def _compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data.
"""
convert_objects_related_datetimes(values)

compute_node_ref = models.ComputeNode()
compute_node_ref.update(values)
compute_node_ref.save(context.session)
return compute_node_ref


# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
# so that we get a separate transaction in the exception handler. This avoids
# an error message about inactive DB sessions during a transaction rollback.
# See https://bugs.launchpad.net/nova/+bug/1853159.
def compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data. Will restore a soft deleted compute node if a
UUID has been explicitly requested.
"""
try:
compute_node_ref.save(context.session)
compute_node_ref = _compute_node_create(context, values)
except db_exc.DBDuplicateEntry:
with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
# Check to see if we have a (soft) deleted ComputeNode with the
@@ -758,10 +771,10 @@ def compute_node_update(context, compute_id, values):


@pick_context_manager_writer
def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a ComputeNode record."""
result = model_query(context, models.ComputeNode).\
filter_by(id=compute_id).\
filter_by(id=compute_id, host=compute_host).\
soft_delete(synchronize_session=False)

if not result:
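The transaction-splitting idea in the NOTE above is easier to see in isolation. A generic, self-contained sketch with a toy transaction context manager standing in for the enginefacade writer decorator (all names are illustrative, not Nova code):

```python
import contextlib


@contextlib.contextmanager
def transaction():
    # Toy stand-in for @pick_context_manager_writer / enginefacade.
    try:
        yield
    except Exception:
        print("rollback")
        raise
    else:
        print("commit")


class DBDuplicateEntry(Exception):
    pass


def _create(values):
    # The inner helper owns its own transaction, like _compute_node_create.
    with transaction():
        if values.get("duplicate"):
            raise DBDuplicateEntry()
        return dict(values)


def create(values):
    # Deliberately *not* wrapped in transaction(): by the time the handler
    # below runs, the inner transaction has already rolled back, so starting
    # a fresh one (e.g. to undelete a soft-deleted row) is safe.
    try:
        return _create(values)
    except DBDuplicateEntry:
        with transaction():
            return {"restored": True}


print(create({"name": "node1"}))    # commit, then {'name': 'node1'}
print(create({"duplicate": True}))  # rollback, then commit, {'restored': True}
```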
2 changes: 1 addition & 1 deletion nova/objects/compute_node.py
@@ -337,7 +337,7 @@ def save(self, prune_stats=False):

@base.remotable
def destroy(self):
db.compute_node_delete(self._context, self.id)
db.compute_node_delete(self._context, self.id, self.host)

def update_from_virt_driver(self, resources):
# NOTE(pmurray): the virt driver provides a dict of values that
66 changes: 51 additions & 15 deletions nova/test.py
@@ -419,30 +419,42 @@ def flags(self, **kw):

def start_service(self, name, host=None, **kwargs):
cell = None
# if the host is None then the CONF.host remains defaulted to
# 'fake-mini' (originally done in ConfFixture)
if host is not None:
# Make sure that CONF.host is relevant to the right hostname
self.useFixture(nova_fixtures.ConfPatcher(host=host))

if name == 'compute' and self.USES_DB:
# NOTE(danms): We need to create the HostMapping first, because
# otherwise we'll fail to update the scheduler while running
# the compute node startup routines below.
ctxt = context.get_context()
cell_name = kwargs.pop('cell', CELL1_NAME) or CELL1_NAME
cell = self.cell_mappings[cell_name]
hm = objects.HostMapping(context=ctxt,
host=host or name,
cell_mapping=cell)
hm.create()
self.host_mappings[hm.host] = hm
if host is not None:
# Make sure that CONF.host is relevant to the right hostname
self.useFixture(nova_fixtures.ConfPatcher(host=host))
if (host or name) not in self.host_mappings:
# NOTE(gibi): If the HostMapping does not exists then this is
# the first start of the service so we create the mapping.
hm = objects.HostMapping(context=ctxt,
host=host or name,
cell_mapping=cell)
hm.create()
self.host_mappings[hm.host] = hm
svc = self.useFixture(
nova_fixtures.ServiceFixture(name, host, cell=cell, **kwargs))

return svc.service

def restart_compute_service(self, compute):
"""Restart a compute service in a realistic way.
def restart_compute_service(self, compute, keep_hypervisor_state=True):
"""Stops the service and starts a new one to have realistic restart

:param:compute: the nova-compute service to be restarted
:param:keep_hypervisor_state: If true then already defined instances
will survive the compute service restart.
If false then the new service will see
an empty hypervisor
:returns: a new compute service instance serving the same host and
          node
"""

# NOTE(gibi): The service interface cannot be used to simulate a real
@@ -452,12 +464,36 @@ def restart_compute_service(self, compute):
# a stop start. The service.kill() call cannot help as it deletes
# the service from the DB which is unrealistic and causes that some
# operation that refers to the killed host (e.g. evacuate) fails.
# So this helper method tries to simulate a better compute service
# restart by cleaning up some of the internal state of the compute
# manager.
# So this helper method will stop the original service and then starts
# a brand new compute service for the same host and node. This way
# a new ComputeManager instance will be created and initialized during
# the service startup.
compute.stop()
compute.manager._resource_tracker = None
compute.start()

# this service was running previously so we have to make sure that
# we restart it in the same cell
cell_name = self.host_mappings[compute.host].cell_mapping.name

if keep_hypervisor_state:
# NOTE(gibi): FakeDriver does not provide a meaningful way to
# define some servers that exists already on the hypervisor when
# the driver is (re)created during the service startup. This means
# that we cannot simulate that the definition of a server
# survives a nova-compute service restart on the hypervisor.
# Instead here we save the FakeDriver instance that knows about
# the defined servers and inject that driver into the new Manager
# class during the startup of the compute service.
old_driver = compute.manager.driver
with mock.patch(
'nova.virt.driver.load_compute_driver') as load_driver:
load_driver.return_value = old_driver
new_compute = self.start_service(
'compute', host=compute.host, cell=cell_name)
else:
new_compute = self.start_service(
'compute', host=compute.host, cell=cell_name)

return new_compute

def assertJsonEqual(self, expected, observed, message=''):
"""Asserts that 2 complex data structures are json equivalent.
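A hypothetical functional-test sketch (test class and host names invented, not part of this change) showing how the reworked helper is intended to be used; keep_hypervisor_state=False simulates a hypervisor that comes back empty after the restart:

```python
from nova.tests.functional import integrated_helpers


class RestartSketchTest(integrated_helpers._IntegratedTestBase):
    """Illustrative sketch only."""

    def test_restart_compute(self):
        compute2 = self.start_service('compute', host='host2')

        # Default: the FakeDriver instance (and the servers it has defined)
        # is carried over to the new service, like a real process restart.
        compute2 = self.restart_compute_service(compute2)

        # Simulate a restart where the hypervisor has lost its instances.
        compute2 = self.restart_compute_service(
            compute2, keep_hypervisor_state=False)
```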
@@ -1,7 +1,7 @@
{
"hypervisors": [
{
"hypervisor_hostname": "fake-mini",
"hypervisor_hostname": "host2",
"id": "%(hypervisor_id)s",
"state": "up",
"status": "enabled"
5 changes: 0 additions & 5 deletions nova/tests/functional/api_sample_tests/test_hypervisors.py
@@ -18,7 +18,6 @@
from nova.cells import utils as cells_utils
from nova import objects
from nova.tests.functional.api_sample_tests import api_sample_base
from nova.virt import fake


class HypervisorsSampleJsonTests(api_sample_base.ApiSampleTestBaseV21):
@@ -157,8 +156,6 @@ def setUp(self):
# Start a new compute service to fake a record with hypervisor id=2
# for pagination test.
host = 'host1'
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
self.start_service('compute', host=host)

def test_hypervisors_list(self):
Expand Down Expand Up @@ -205,8 +202,6 @@ def test_hypervisors_list(self):
def test_hypervisors_detail(self):
# Start another compute service to get a 2nd compute for paging tests.
host = 'host2'
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
service_2 = self.start_service('compute', host=host).service_ref
compute_node_2 = service_2.compute_node
marker = self.compute_node_1.uuid
3 changes: 0 additions & 3 deletions nova/tests/functional/compute/test_live_migration.py
@@ -20,7 +20,6 @@
from nova.tests.functional import integrated_helpers
from nova.tests.unit import fake_notifier
from nova.tests import uuidsentinel as uuids
from nova.virt import fake


class FakeCinderError(object):
@@ -53,8 +52,6 @@ def setUp(self):
# Start a second compute node (the first one was started for us by
# _IntegratedTestBase. set_nodes() is needed to avoid duplicate
# nodenames. See comments in test_bug_1702454.py.
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.compute2 = self.start_service('compute', host='host2')

# To get the old Cinder flow we need to hack the service version, otherwise
3 changes: 0 additions & 3 deletions nova/tests/functional/integrated_helpers.py
@@ -35,7 +35,6 @@
import nova.tests.unit.image.fake
from nova.tests.unit import policy_fixture
from nova.tests import uuidsentinel as uuids
from nova.virt import fake


CONF = nova.conf.CONF
@@ -396,8 +395,6 @@ def _start_compute(self, host, cell_name=None):
compute service (defaults to cell1)
:return: the nova compute service object
"""
fake.set_nodes([host])
self.addCleanup(fake.restore_nodes)
compute = self.start_service('compute', host=host, cell=cell_name)
self.computes[host] = compute
return compute
@@ -22,7 +22,6 @@
from nova.tests.functional.notification_sample_tests \
import notification_sample_base
from nova.tests.unit import fake_notifier
from nova.virt import fake

COMPUTE_VERSION_OLD_ATTACH_FLOW = \
compute_api.CINDER_V3_ATTACH_MIN_COMPUTE_VERSION - 1
@@ -48,8 +47,6 @@ def test_multiple_compute_actions(self):
self._wait_for_notification('instance.create.end')
self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
# server will boot on the 'compute' host
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.compute2 = self.start_service('compute', host='host2')

actions = [
3 changes: 0 additions & 3 deletions nova/tests/functional/regressions/test_bug_1669054.py
@@ -13,7 +13,6 @@
from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers
from nova.virt import fake


class ResizeEvacuateTestCase(integrated_helpers._IntegratedTestBase,
@@ -47,8 +46,6 @@ def test_resize_then_evacuate(self):
self._wait_for_state_change(self.api, server, 'ACTIVE')

# Start up another compute service so we can resize.
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
host2 = self.start_service('compute', host='host2')

# Now resize the server to move it to host2.