- properly handles big-endian data in `iputils.py` (Closes: #1057031). Signed-off-by: Daniel Baumann <daniel@debian.org>
858 lines
34 KiB
Python
858 lines
34 KiB
Python
# Copyright (c) 2022, Dell Inc. or its subsidiaries. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
# See the LICENSE file for details.
|
|
#
|
|
# This file is part of NVMe STorage Appliance Services (nvme-stas).
|
|
#
|
|
# Authors: Martin Belanger <Martin.Belanger@dell.com>
|
|
#
|
|
'''This module defines the base Controller object from which the
|
|
Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.'''
|
|
|
|
import time
|
|
import inspect
|
|
import logging
|
|
from gi.repository import GLib
|
|
from libnvme import nvme
|
|
from staslib import conf, defs, gutil, trid, udev, stas
|
|
|
|
|
|
DLP_CHANGED = (
|
|
(nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE
|
|
) # 0x70f002
|
|
|
|
|
|
def get_eflags(dlpe):
|
|
'''@brief Return eflags field of dlpe'''
|
|
return int(dlpe.get('eflags', 0)) if dlpe else 0
|
|
|
|
|
|
def get_ncc(eflags: int):
|
|
'''@brief Return True if Not Connected to CDC bit is asserted, False otherwise'''
|
|
return eflags & nvme.NVMF_DISC_EFLAGS_NCC != 0
|
|
|
|
|
|
def dlp_supp_opts_as_string(dlp_supp_opts: int):
|
|
'''@brief Return the list of options supported by the Get
|
|
discovery log page command.
|
|
'''
|
|
data = {
|
|
nvme.NVMF_LOG_DISC_LID_EXTDLPES: "EXTDLPES",
|
|
nvme.NVMF_LOG_DISC_LID_PLEOS: "PLEOS",
|
|
nvme.NVMF_LOG_DISC_LID_ALLSUBES: "ALLSUBES",
|
|
}
|
|
return [txt for msk, txt in data.items() if dlp_supp_opts & msk]
|
|
|
|
|
|
# ******************************************************************************
|
|
class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attributes
|
|
'''@brief Base class used to manage the connection to a controller.'''
|
|
|
|
def __init__(self, tid: trid.TID, service, discovery_ctrl: bool = False):
|
|
sysconf = conf.SysConf()
|
|
self._nvme_options = conf.NvmeOptions()
|
|
self._root = nvme.root()
|
|
self._host = nvme.host(
|
|
self._root, hostnqn=sysconf.hostnqn, hostid=sysconf.hostid, hostsymname=sysconf.hostsymname
|
|
)
|
|
self._host.dhchap_key = sysconf.hostkey if self._nvme_options.dhchap_hostkey_supp else None
|
|
self._udev = udev.UDEV
|
|
self._device = None # Refers to the nvme device (e.g. /dev/nvme[n])
|
|
self._ctrl = None # libnvme's nvme.ctrl object
|
|
self._connect_op = None
|
|
|
|
super().__init__(tid, service, discovery_ctrl)
|
|
|
|
def _release_resources(self):
|
|
logging.debug('Controller._release_resources() - %s | %s', self.id, self.device)
|
|
|
|
if self._udev:
|
|
self._udev.unregister_for_device_events(self._on_udev_notification)
|
|
|
|
self._kill_ops()
|
|
|
|
super()._release_resources()
|
|
|
|
self._ctrl = None
|
|
self._udev = None
|
|
self._host = None
|
|
self._root = None
|
|
self._nvme_options = None
|
|
|
|
@property
|
|
def device(self) -> str:
|
|
'''@brief return the Linux nvme device id (e.g. nvme3) or empty
|
|
string if no device is associated with this controller'''
|
|
if not self._device and self._ctrl and self._ctrl.name:
|
|
self._device = self._ctrl.name
|
|
|
|
return self._device or 'nvme?'
|
|
|
|
def all_ops_completed(self) -> bool:
|
|
'''@brief Returns True if all operations have completed. False otherwise.'''
|
|
return self._connect_op is None or self._connect_op.completed()
|
|
|
|
def connected(self):
|
|
'''@brief Return whether a connection is established'''
|
|
return self._ctrl and self._ctrl.connected()
|
|
|
|
def controller_id_dict(self) -> dict:
|
|
'''@brief return the controller ID as a dict.'''
|
|
cid = super().controller_id_dict()
|
|
cid['device'] = self.device
|
|
return cid
|
|
|
|
def details(self) -> dict:
|
|
'''@brief return detailed debug info about this controller'''
|
|
details = super().details()
|
|
details.update(
|
|
self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial', 'dctype', 'cntrltype'))
|
|
)
|
|
details['connected'] = str(self.connected())
|
|
return details
|
|
|
|
def info(self) -> dict:
|
|
'''@brief Get the controller info for this object'''
|
|
info = super().info()
|
|
if self._connect_op:
|
|
info['connect operation'] = str(self._connect_op.as_dict())
|
|
return info
|
|
|
|
def cancel(self):
|
|
'''@brief Used to cancel pending operations.'''
|
|
super().cancel()
|
|
if self._connect_op:
|
|
self._connect_op.cancel()
|
|
|
|
def _kill_ops(self):
|
|
if self._connect_op:
|
|
self._connect_op.kill()
|
|
self._connect_op = None
|
|
|
|
def set_level_from_tron(self, tron):
|
|
'''Set log level based on TRON'''
|
|
if self._root:
|
|
self._root.log_level("debug" if tron else "err")
|
|
|
|
def _on_udev_notification(self, udev_obj):
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Controller._on_udev_notification() - %s | %s: Received event on dead object. udev_obj %s: %s',
|
|
self.id,
|
|
self.device,
|
|
udev_obj.action,
|
|
udev_obj.sys_name,
|
|
)
|
|
return
|
|
|
|
if udev_obj.action == 'change':
|
|
nvme_aen = udev_obj.get('NVME_AEN')
|
|
nvme_event = udev_obj.get('NVME_EVENT')
|
|
if isinstance(nvme_aen, str):
|
|
logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen)
|
|
self._on_aen(int(nvme_aen, 16))
|
|
if isinstance(nvme_event, str):
|
|
self._on_nvme_event(nvme_event)
|
|
elif udev_obj.action == 'remove':
|
|
logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
|
|
self._on_ctrl_removed(udev_obj)
|
|
else:
|
|
logging.debug(
|
|
'Controller._on_udev_notification() - %s | %s: Received "%s" event',
|
|
self.id,
|
|
udev_obj.sys_name,
|
|
udev_obj.action,
|
|
)
|
|
|
|
def _on_ctrl_removed(self, udev_obj): # pylint: disable=unused-argument
|
|
if self._udev:
|
|
self._udev.unregister_for_device_events(self._on_udev_notification)
|
|
self._kill_ops() # Kill all pending operations
|
|
self._ctrl = None
|
|
self._device = None
|
|
self._connect_attempts = 0
|
|
self._retry_connect_tmr.start()
|
|
|
|
def _get_cfg(self):
|
|
'''Get configuration parameters. These may either come from the [Global]
|
|
section or from a "controller" entry in the configuration file. A
|
|
definition found in a "controller" entry overrides the same definition
|
|
found in the [Global] section.
|
|
'''
|
|
cfg = {}
|
|
service_conf = conf.SvcConf()
|
|
for option, keyword in (
|
|
('kato', 'keep_alive_tmo'),
|
|
('queue-size', 'queue_size'),
|
|
('hdr-digest', 'hdr_digest'),
|
|
('data-digest', 'data_digest'),
|
|
('nr-io-queues', 'nr_io_queues'),
|
|
('ctrl-loss-tmo', 'ctrl_loss_tmo'),
|
|
('disable-sqflow', 'disable_sqflow'),
|
|
('nr-poll-queues', 'nr_poll_queues'),
|
|
('nr-write-queues', 'nr_write_queues'),
|
|
('reconnect-delay', 'reconnect_delay'),
|
|
):
|
|
# Check if the value is defined as a "controller" entry (i.e. override)
|
|
ovrd_val = self.tid.cfg.get(option, None)
|
|
if ovrd_val is not None:
|
|
cfg[keyword] = ovrd_val
|
|
else:
|
|
# Check if the value is found in the [Global] section.
|
|
glob_val = service_conf.get_option('Global', option)
|
|
if glob_val is not None:
|
|
cfg[keyword] = glob_val
|
|
|
|
return cfg
|
|
|
|
def _do_connect(self):
|
|
service_conf = conf.SvcConf()
|
|
host_iface = (
|
|
self.tid.host_iface
|
|
if (self.tid.host_iface and not service_conf.ignore_iface and self._nvme_options.host_iface_supp)
|
|
else None
|
|
)
|
|
self._ctrl = nvme.ctrl(
|
|
self._root,
|
|
subsysnqn=self.tid.subsysnqn,
|
|
transport=self.tid.transport,
|
|
traddr=self.tid.traddr,
|
|
trsvcid=self.tid.trsvcid if self.tid.trsvcid else None,
|
|
host_traddr=self.tid.host_traddr if self.tid.host_traddr else None,
|
|
host_iface=host_iface,
|
|
)
|
|
self._ctrl.discovery_ctrl_set(self._discovery_ctrl)
|
|
|
|
# Set the DHCHAP key on the controller
|
|
# NOTE that this will eventually have to
|
|
# change once we have support for AVE (TP8019)
|
|
ctrl_dhchap_key = self.tid.cfg.get('dhchap-ctrl-secret')
|
|
if ctrl_dhchap_key and self._nvme_options.dhchap_ctrlkey_supp:
|
|
has_dhchap_key = hasattr(self._ctrl, 'dhchap_key')
|
|
if not has_dhchap_key:
|
|
logging.warning(
|
|
'%s | %s - libnvme-%s does not allow setting the controller DHCHAP key. Please upgrade libnvme.',
|
|
self.id,
|
|
self.device,
|
|
defs.LIBNVME_VERSION,
|
|
)
|
|
else:
|
|
self._ctrl.dhchap_key = ctrl_dhchap_key
|
|
|
|
# Audit existing nvme devices. If we find a match, then
|
|
# we'll just borrow that device instead of creating a new one.
|
|
udev_obj = self._find_existing_connection()
|
|
if udev_obj is not None:
|
|
# A device already exists.
|
|
self._device = udev_obj.sys_name
|
|
logging.debug(
|
|
'Controller._do_connect() - %s Found existing control device: %s', self.id, udev_obj.sys_name
|
|
)
|
|
self._connect_op = gutil.AsyncTask(
|
|
self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number)
|
|
)
|
|
else:
|
|
cfg = self._get_cfg()
|
|
logging.debug(
|
|
'Controller._do_connect() - %s Connecting to nvme control with cfg=%s', self.id, cfg
|
|
)
|
|
self._connect_op = gutil.AsyncTask(
|
|
self._on_connect_success, self._on_connect_fail, self._ctrl.connect, self._host, cfg
|
|
)
|
|
|
|
self._connect_op.run_async()
|
|
|
|
# --------------------------------------------------------------------------
|
|
def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
|
|
'''@brief Function called when we successfully connect to the
|
|
Controller.
|
|
'''
|
|
op_obj.kill()
|
|
self._connect_op = None
|
|
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Controller._on_connect_success() - %s | %s: Received event on dead object. data=%s',
|
|
self.id,
|
|
self.device,
|
|
data,
|
|
)
|
|
return
|
|
|
|
self._device = self._ctrl.name
|
|
logging.info('%s | %s - Connection established!', self.id, self.device)
|
|
self._connect_attempts = 0
|
|
self._udev.register_for_device_events(self._device, self._on_udev_notification)
|
|
|
|
def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): # pylint: disable=unused-argument
|
|
'''@brief Function called when we fail to connect to the Controller.'''
|
|
op_obj.kill()
|
|
self._connect_op = None
|
|
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Controller._on_connect_fail() - %s Received event on dead object. %s %s',
|
|
self.id,
|
|
err.domain,
|
|
err.message,
|
|
)
|
|
return
|
|
|
|
if self._connect_attempts == 1:
|
|
# Do a fast re-try on the first failure.
|
|
self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
|
|
elif self._connect_attempts == 2:
|
|
# If the fast connect re-try fails, then we can print a message to
|
|
# indicate the failure, and start a slow re-try period.
|
|
self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
|
|
logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message)
|
|
|
|
if self._should_try_to_reconnect():
|
|
logging.debug(
|
|
'Controller._on_connect_fail() - %s %s. Retry in %s sec.',
|
|
self.id,
|
|
err,
|
|
self._retry_connect_tmr.get_timeout(),
|
|
)
|
|
self._retry_connect_tmr.start()
|
|
|
|
def disconnect(self, disconnected_cb, keep_connection):
|
|
'''@brief Issue an asynchronous disconnect command to a Controller.
|
|
Once the async command has completed, the callback 'disconnected_cb'
|
|
will be invoked. If a controller is already disconnected, then the
|
|
callback will be added to the main loop's next idle slot to be executed
|
|
ASAP.
|
|
|
|
@param disconnected_cb: Callback to be called when disconnect has
|
|
completed. the callback must have this signature:
|
|
def cback(controller: Controller, success: bool)
|
|
@param keep_connection: Whether the underlying connection should remain
|
|
in the kernel.
|
|
'''
|
|
logging.debug(
|
|
'Controller.disconnect() - %s | %s: keep_connection=%s', self.id, self.device, keep_connection
|
|
)
|
|
if self._ctrl and self._ctrl.connected() and not keep_connection:
|
|
logging.info('%s | %s - Disconnect initiated', self.id, self.device)
|
|
op = gutil.AsyncTask(self._on_disconn_success, self._on_disconn_fail, self._ctrl.disconnect)
|
|
op.run_async(disconnected_cb)
|
|
else:
|
|
# Defer callback to the next main loop's idle period. The callback
|
|
# cannot be called directly as the current Controller object is in the
|
|
# process of being disconnected and the callback will in fact delete
|
|
# the object. This would invariably lead to unpredictable outcome.
|
|
GLib.idle_add(disconnected_cb, self, True)
|
|
|
|
def _on_disconn_success(self, op_obj: gutil.AsyncTask, data, disconnected_cb): # pylint: disable=unused-argument
|
|
logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device)
|
|
op_obj.kill()
|
|
# Defer callback to the next main loop's idle period. The callback
|
|
# cannot be called directly as the current Controller object is in the
|
|
# process of being disconnected and the callback will in fact delete
|
|
# the object. This would invariably lead to unpredictable outcome.
|
|
GLib.idle_add(disconnected_cb, self, True)
|
|
|
|
def _on_disconn_fail(
|
|
self, op_obj: gutil.AsyncTask, err, fail_cnt, disconnected_cb
|
|
): # pylint: disable=unused-argument
|
|
logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err)
|
|
op_obj.kill()
|
|
# Defer callback to the next main loop's idle period. The callback
|
|
# cannot be called directly as the current Controller object is in the
|
|
# process of being disconnected and the callback will in fact delete
|
|
# the object. This would invariably lead to unpredictable outcome.
|
|
GLib.idle_add(disconnected_cb, self, False)
|
|
|
|
|
|
# ******************************************************************************
|
|
class Dc(Controller):
|
|
'''@brief This object establishes a connection to one Discover Controller (DC).
|
|
It retrieves the discovery log pages and caches them.
|
|
It also monitors udev events associated with that DC and updates
|
|
the cached discovery log pages accordingly.
|
|
'''
|
|
|
|
GET_LOG_PAGE_RETRY_RERIOD_SEC = 20
|
|
REGISTRATION_RETRY_RERIOD_SEC = 5
|
|
GET_SUPPORTED_RETRY_RERIOD_SEC = 5
|
|
|
|
def __init__(self, staf, tid: trid.TID, log_pages=None, origin=None):
|
|
super().__init__(tid, staf, discovery_ctrl=True)
|
|
self._register_op = None
|
|
self._get_supported_op = None
|
|
self._get_log_op = None
|
|
self._origin = origin
|
|
self._log_pages = log_pages if log_pages else list() # Log pages cache
|
|
|
|
# For Avahi-discovered DCs that later become unresponsive, monitor how
|
|
# long the controller remains unresponsive and if it does not return for
|
|
# a configurable soak period (_ctrl_unresponsive_tmr), remove that
|
|
# controller. Only Avahi-discovered controllers need this timeout-based
|
|
# cleanup.
|
|
self._ctrl_unresponsive_time = None # The time at which connectivity was lost
|
|
self._ctrl_unresponsive_tmr = gutil.GTimer(0, self._serv.controller_unresponsive, self.tid)
|
|
|
|
def _release_resources(self):
|
|
logging.debug('Dc._release_resources() - %s | %s', self.id, self.device)
|
|
super()._release_resources()
|
|
|
|
if self._ctrl_unresponsive_tmr is not None:
|
|
self._ctrl_unresponsive_tmr.kill()
|
|
|
|
self._log_pages = list()
|
|
self._ctrl_unresponsive_tmr = None
|
|
|
|
def _kill_ops(self):
|
|
super()._kill_ops()
|
|
if self._get_log_op:
|
|
self._get_log_op.kill()
|
|
self._get_log_op = None
|
|
if self._register_op:
|
|
self._register_op.kill()
|
|
self._register_op = None
|
|
if self._get_supported_op:
|
|
self._get_supported_op.kill()
|
|
self._get_supported_op = None
|
|
|
|
def all_ops_completed(self) -> bool:
|
|
'''@brief Returns True if all operations have completed. False otherwise.'''
|
|
return (
|
|
super().all_ops_completed()
|
|
and (self._get_log_op is None or self._get_log_op.completed())
|
|
and (self._register_op is None or self._register_op.completed())
|
|
and (self._get_supported_op is None or self._get_supported_op.completed())
|
|
)
|
|
|
|
@property
|
|
def origin(self):
|
|
'''@brief Return how this controller came into existence. Was it
|
|
"discovered" through mDNS service discovery (TP8009), was it manually
|
|
"configured" in stafd.conf, or was it a "referral".
|
|
'''
|
|
return self._origin
|
|
|
|
@origin.setter
|
|
def origin(self, value):
|
|
'''@brief Set the origin of this controller.'''
|
|
if value in ('discovered', 'configured', 'referral'):
|
|
self._origin = value
|
|
self._handle_lost_controller()
|
|
else:
|
|
logging.error('%s | %s - Trying to set invalid origin to %s', self.id, self.device, value)
|
|
|
|
def reload_hdlr(self):
|
|
'''@brief This is called when a "reload" signal is received.'''
|
|
logging.debug('Dc.reload_hdlr() - %s | %s', self.id, self.device)
|
|
|
|
self._handle_lost_controller()
|
|
self._resync_with_controller()
|
|
|
|
def info(self) -> dict:
|
|
'''@brief Get the controller info for this object'''
|
|
timeout = conf.SvcConf().zeroconf_persistence_sec
|
|
unresponsive_time = (
|
|
time.asctime(self._ctrl_unresponsive_time) if self._ctrl_unresponsive_time is not None else '---'
|
|
)
|
|
info = super().info()
|
|
info['origin'] = self.origin
|
|
if self.origin == 'discovered':
|
|
# The code that handles "unresponsive" DCs only applies to
|
|
# discovered DCs. So, let's only print that info when it's relevant.
|
|
info['unresponsive timer'] = str(self._ctrl_unresponsive_tmr)
|
|
info['unresponsive timeout'] = f'{timeout} sec' if timeout >= 0 else 'forever'
|
|
info['unresponsive time'] = unresponsive_time
|
|
if self._get_log_op:
|
|
info['get log page operation'] = str(self._get_log_op.as_dict())
|
|
if self._register_op:
|
|
info['register operation'] = str(self._register_op.as_dict())
|
|
if self._get_supported_op:
|
|
info['get supported log page operation'] = str(self._get_supported_op.as_dict())
|
|
return info
|
|
|
|
def cancel(self):
|
|
'''@brief Used to cancel pending operations.'''
|
|
super().cancel()
|
|
if self._get_log_op:
|
|
self._get_log_op.cancel()
|
|
if self._register_op:
|
|
self._register_op.cancel()
|
|
if self._get_supported_op:
|
|
self._get_supported_op.cancel()
|
|
|
|
def log_pages(self) -> list:
|
|
'''@brief Get the cached log pages for this object'''
|
|
return self._log_pages
|
|
|
|
def referrals(self) -> list:
|
|
'''@brief Return the list of referrals'''
|
|
return [page for page in self._log_pages if page['subtype'] == 'referral']
|
|
|
|
def _is_ddc(self):
|
|
return self._ctrl and self._ctrl.dctype != 'cdc'
|
|
|
|
def _on_aen(self, aen: int):
|
|
if aen == DLP_CHANGED and self._get_log_op:
|
|
self._get_log_op.run_async()
|
|
|
|
def _handle_lost_controller(self):
|
|
if self.origin == 'discovered': # Only apply to mDNS-discovered DCs
|
|
if not self._serv.is_avahi_reported(self.tid) and not self.connected():
|
|
timeout = conf.SvcConf().zeroconf_persistence_sec
|
|
if timeout >= 0:
|
|
if self._ctrl_unresponsive_time is None:
|
|
self._ctrl_unresponsive_time = time.localtime()
|
|
self._ctrl_unresponsive_tmr.start(timeout)
|
|
logging.info(
|
|
'%s | %s - Controller is not responding. Will be removed by %s unless restored',
|
|
self.id,
|
|
self.device,
|
|
time.ctime(time.mktime(self._ctrl_unresponsive_time) + timeout),
|
|
)
|
|
|
|
return
|
|
|
|
logging.info('%s | %s - Controller not responding. Retrying...', self.id, self.device)
|
|
|
|
self._ctrl_unresponsive_time = None
|
|
self._ctrl_unresponsive_tmr.stop()
|
|
self._ctrl_unresponsive_tmr.set_timeout(0)
|
|
|
|
def is_unresponsive(self):
|
|
'''@brief For "discovered" DC, return True if DC is unresponsive,
|
|
False otherwise.
|
|
'''
|
|
return (
|
|
self.origin == 'discovered'
|
|
and not self._serv.is_avahi_reported(self.tid)
|
|
and not self.connected()
|
|
and self._ctrl_unresponsive_time is not None
|
|
and self._ctrl_unresponsive_tmr.time_remaining() <= 0
|
|
)
|
|
|
|
def _resync_with_controller(self):
|
|
'''Communicate with DC to resync the states'''
|
|
if self._register_op:
|
|
self._register_op.run_async()
|
|
elif self._get_supported_op:
|
|
self._get_supported_op.run_async()
|
|
elif self._get_log_op:
|
|
self._get_log_op.run_async()
|
|
|
|
def _on_nvme_event(self, nvme_event: str):
|
|
if nvme_event in ('connected', 'rediscover'):
|
|
# This event indicates that the kernel
|
|
# driver re-connected to the DC.
|
|
logging.debug(
|
|
'Dc._on_nvme_event() - %s | %s: Received "%s" event',
|
|
self.id,
|
|
self.device,
|
|
nvme_event,
|
|
)
|
|
self._resync_with_controller()
|
|
|
|
def _find_existing_connection(self):
|
|
return self._udev.find_nvme_dc_device(self.tid)
|
|
|
|
def _post_registration_actions(self):
|
|
# Need to check that supported_log_pages() is available (introduced in libnvme 1.2)
|
|
get_slp = getattr(self._ctrl, 'supported_log_pages', None)
|
|
if get_slp is None:
|
|
logging.warning(
|
|
'%s | %s - libnvme-%s does not support "Get supported log pages". Please upgrade libnvme.',
|
|
self.id,
|
|
self.device,
|
|
defs.LIBNVME_VERSION,
|
|
)
|
|
|
|
if conf.SvcConf().pleo_enabled and self._is_ddc() and get_slp is not None:
|
|
self._get_supported_op = gutil.AsyncTask(
|
|
self._on_get_supported_success, self._on_get_supported_fail, get_slp
|
|
)
|
|
self._get_supported_op.run_async()
|
|
else:
|
|
self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
|
|
self._get_log_op.run_async()
|
|
|
|
# --------------------------------------------------------------------------
|
|
def _on_connect_success(self, op_obj: gutil.AsyncTask, data):
|
|
'''@brief Function called when we successfully connect to the
|
|
Discovery Controller.
|
|
'''
|
|
super()._on_connect_success(op_obj, data)
|
|
|
|
if not self._alive():
|
|
return
|
|
|
|
self._ctrl_unresponsive_time = None
|
|
self._ctrl_unresponsive_tmr.stop()
|
|
self._ctrl_unresponsive_tmr.set_timeout(0)
|
|
|
|
if self._ctrl.is_registration_supported():
|
|
self._register_op = gutil.AsyncTask(
|
|
self._on_registration_success,
|
|
self._on_registration_fail,
|
|
self._ctrl.registration_ctlr,
|
|
nvme.NVMF_DIM_TAS_REGISTER,
|
|
)
|
|
self._register_op.run_async()
|
|
else:
|
|
self._post_registration_actions()
|
|
|
|
def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
|
|
'''@brief Function called when we fail to connect to the Controller.'''
|
|
super()._on_connect_fail(op_obj, err, fail_cnt)
|
|
|
|
if self._alive():
|
|
self._handle_lost_controller()
|
|
|
|
# --------------------------------------------------------------------------
|
|
def _on_registration_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
|
|
'''@brief Function called when we successfully register with the
|
|
Discovery Controller. See self._register_op object
|
|
for details.
|
|
|
|
NOTE: The name _on_registration_success() may be misleading. "success"
|
|
refers to the fact that a successful exchange was made with the DC.
|
|
It doesn't mean that the registration itself succeeded.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_registration_success() - %s | %s: Received event on dead object.', self.id, self.device
|
|
)
|
|
return
|
|
|
|
if data is not None:
|
|
logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
|
|
else:
|
|
logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
|
|
|
|
self._post_registration_actions()
|
|
|
|
def _on_registration_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
|
|
'''@brief Function called when we fail to register with the
|
|
Discovery Controller. See self._register_op object
|
|
for details.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_registration_fail() - %s | %s: Received event on dead object. %s',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
)
|
|
op_obj.kill()
|
|
return
|
|
|
|
logging.debug(
|
|
'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
Dc.REGISTRATION_RETRY_RERIOD_SEC,
|
|
)
|
|
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
|
|
logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
|
|
op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
|
|
|
|
# --------------------------------------------------------------------------
|
|
def _on_get_supported_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
|
|
'''@brief Function called when we successfully retrieved the supported
|
|
log pages from the Discovery Controller. See self._get_supported_op object
|
|
for details.
|
|
|
|
NOTE: The name _on_get_supported_success() may be misleading. "success"
|
|
refers to the fact that a successful exchange was made with the DC.
|
|
It doesn't mean that the Get Supported Log Page itself succeeded.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device
|
|
)
|
|
return
|
|
|
|
try:
|
|
dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16
|
|
except (TypeError, IndexError):
|
|
dlp_supp_opts = 0
|
|
|
|
logging.debug(
|
|
'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s',
|
|
self.id,
|
|
self.device,
|
|
dlp_supp_opts,
|
|
dlp_supp_opts_as_string(dlp_supp_opts),
|
|
)
|
|
|
|
if 'lsp' in inspect.signature(self._ctrl.discover).parameters:
|
|
lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0
|
|
self._get_log_op = gutil.AsyncTask(
|
|
self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp
|
|
)
|
|
else:
|
|
logging.warning(
|
|
'%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.',
|
|
self.id,
|
|
self.device,
|
|
defs.LIBNVME_VERSION,
|
|
)
|
|
self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
|
|
self._get_log_op.run_async()
|
|
|
|
def _on_get_supported_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
|
|
'''@brief Function called when we fail to retrieve the supported log
|
|
page from the Discovery Controller. See self._get_supported_op object
|
|
for details.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
)
|
|
op_obj.kill()
|
|
return
|
|
|
|
logging.debug(
|
|
'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
Dc.GET_SUPPORTED_RETRY_RERIOD_SEC,
|
|
)
|
|
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
|
|
logging.error(
|
|
'%s | %s - Failed to Get supported log pages from Discovery Controller. %s',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
)
|
|
op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC)
|
|
|
|
# --------------------------------------------------------------------------
|
|
def _on_get_log_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
|
|
'''@brief Function called when we successfully retrieve the log pages
|
|
from the Discovery Controller. See self._get_log_op object
|
|
for details.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device
|
|
)
|
|
return
|
|
|
|
# Note that for historical reasons too long to explain, the CDC may
|
|
# return invalid addresses ('0.0.0.0', '::', or ''). Those need to
|
|
# be filtered out.
|
|
referrals_before = self.referrals()
|
|
self._log_pages = (
|
|
[
|
|
{k.strip(): str(v).strip() for k, v in dictionary.items()}
|
|
for dictionary in data
|
|
if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '')
|
|
]
|
|
if data
|
|
else list()
|
|
)
|
|
logging.info(
|
|
'%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
|
|
)
|
|
referrals_after = self.referrals()
|
|
self._serv.log_pages_changed(self, self.device)
|
|
if referrals_after != referrals_before:
|
|
logging.debug(
|
|
'Dc._on_get_log_success() - %s | %s: Referrals before = %s',
|
|
self.id,
|
|
self.device,
|
|
referrals_before,
|
|
)
|
|
logging.debug(
|
|
'Dc._on_get_log_success() - %s | %s: Referrals after = %s',
|
|
self.id,
|
|
self.device,
|
|
referrals_after,
|
|
)
|
|
self._serv.referrals_changed()
|
|
|
|
def _on_get_log_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
|
|
'''@brief Function called when we fail to retrieve the log pages
|
|
from the Discovery Controller. See self._get_log_op object
|
|
for details.
|
|
'''
|
|
if not self._alive():
|
|
logging.debug(
|
|
'Dc._on_get_log_fail() - %s | %s: Received event on dead object. %s',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
)
|
|
op_obj.kill()
|
|
return
|
|
|
|
logging.debug(
|
|
'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
|
|
self.id,
|
|
self.device,
|
|
err,
|
|
Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
|
|
)
|
|
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
|
|
logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
|
|
op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
|
|
|
|
|
|
# ******************************************************************************
|
|
class Ioc(Controller):
|
|
'''@brief This object establishes a connection to one I/O Controller.'''
|
|
|
|
def __init__(self, stac, tid: trid.TID):
|
|
self._dlpe = None
|
|
super().__init__(tid, stac)
|
|
|
|
def _find_existing_connection(self):
|
|
return self._udev.find_nvme_ioc_device(self.tid)
|
|
|
|
def _on_aen(self, aen: int):
|
|
pass
|
|
|
|
def _on_nvme_event(self, nvme_event):
|
|
pass
|
|
|
|
def reload_hdlr(self):
|
|
'''@brief This is called when a "reload" signal is received.'''
|
|
if not self.connected() and self._retry_connect_tmr.time_remaining() == 0:
|
|
self._try_to_connect_deferred.schedule()
|
|
|
|
@property
|
|
def eflags(self):
|
|
'''@brief Return the eflag field of the DLPE'''
|
|
return get_eflags(self._dlpe)
|
|
|
|
@property
|
|
def ncc(self):
|
|
'''@brief Return Not Connected to CDC status'''
|
|
return get_ncc(self.eflags)
|
|
|
|
def details(self) -> dict:
|
|
'''@brief return detailed debug info about this controller'''
|
|
details = super().details()
|
|
details['dlpe'] = str(self._dlpe)
|
|
details['dlpe.eflags.ncc'] = str(self.ncc)
|
|
return details
|
|
|
|
def update_dlpe(self, dlpe):
|
|
'''@brief This method is called when a new DLPE associated
|
|
with this controller is received.'''
|
|
new_ncc = get_ncc(get_eflags(dlpe))
|
|
old_ncc = self.ncc
|
|
self._dlpe = dlpe
|
|
|
|
if old_ncc and not new_ncc: # NCC bit cleared?
|
|
if not self.connected():
|
|
self._connect_attempts = 0
|
|
self._try_to_connect_deferred.schedule()
|
|
|
|
def _should_try_to_reconnect(self):
|
|
'''@brief This is used to determine when it's time to stop trying to connect'''
|
|
max_connect_attempts = conf.SvcConf().connect_attempts_on_ncc if self.ncc else 0
|
|
return max_connect_attempts == 0 or self._connect_attempts < max_connect_attempts
|