1
0
Fork 0

Merging upstream version 2.3~rc1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-16 12:53:58 +01:00
parent 972d2d9aa2
commit ca2ec6771a
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
37 changed files with 1946 additions and 407 deletions

View file

@ -135,26 +135,7 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri
self._root.log_level("debug" if tron else "err")
def _on_udev_notification(self, udev_obj):
if self._alive():
if udev_obj.action == 'change':
nvme_aen = udev_obj.get('NVME_AEN')
nvme_event = udev_obj.get('NVME_EVENT')
if isinstance(nvme_aen, str):
logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen)
self._on_aen(int(nvme_aen, 16))
if isinstance(nvme_event, str):
self._on_nvme_event(nvme_event)
elif udev_obj.action == 'remove':
logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
self._on_ctrl_removed(udev_obj)
else:
logging.debug(
'Controller._on_udev_notification() - %s | %s: Received "%s" event',
self.id,
udev_obj.sys_name,
udev_obj.action,
)
else:
if not self._alive():
logging.debug(
'Controller._on_udev_notification() - %s | %s: Received event on dead object. udev_obj %s: %s',
self.id,
@ -162,6 +143,26 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri
udev_obj.action,
udev_obj.sys_name,
)
return
if udev_obj.action == 'change':
nvme_aen = udev_obj.get('NVME_AEN')
nvme_event = udev_obj.get('NVME_EVENT')
if isinstance(nvme_aen, str):
logging.info('%s | %s - Received AEN: %s', self.id, udev_obj.sys_name, nvme_aen)
self._on_aen(int(nvme_aen, 16))
if isinstance(nvme_event, str):
self._on_nvme_event(nvme_event)
elif udev_obj.action == 'remove':
logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
self._on_ctrl_removed(udev_obj)
else:
logging.debug(
'Controller._on_udev_notification() - %s | %s: Received "%s" event',
self.id,
udev_obj.sys_name,
udev_obj.action,
)
def _on_ctrl_removed(self, udev_obj): # pylint: disable=unused-argument
if self._udev:
@ -269,48 +270,51 @@ class Controller(stas.ControllerABC): # pylint: disable=too-many-instance-attri
op_obj.kill()
self._connect_op = None
if self._alive():
self._device = self._ctrl.name
logging.info('%s | %s - Connection established!', self.id, self.device)
self._connect_attempts = 0
self._udev.register_for_device_events(self._device, self._on_udev_notification)
else:
if not self._alive():
logging.debug(
'Controller._on_connect_success() - %s | %s: Received event on dead object. data=%s',
self.id,
self.device,
data,
)
return
self._device = self._ctrl.name
logging.info('%s | %s - Connection established!', self.id, self.device)
self._connect_attempts = 0
self._udev.register_for_device_events(self._device, self._on_udev_notification)
def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt): # pylint: disable=unused-argument
'''@brief Function called when we fail to connect to the Controller.'''
op_obj.kill()
self._connect_op = None
if self._alive():
if self._connect_attempts == 1:
# Do a fast re-try on the first failure.
self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
elif self._connect_attempts == 2:
# If the fast connect re-try fails, then we can print a message to
# indicate the failure, and start a slow re-try period.
self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message)
if self._should_try_to_reconnect():
logging.debug(
'Controller._on_connect_fail() - %s %s. Retry in %s sec.',
self.id,
err,
self._retry_connect_tmr.get_timeout(),
)
self._retry_connect_tmr.start()
else:
if not self._alive():
logging.debug(
'Controller._on_connect_fail() - %s Received event on dead object. %s %s',
self.id,
err.domain,
err.message,
)
return
if self._connect_attempts == 1:
# Do a fast re-try on the first failure.
self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
elif self._connect_attempts == 2:
# If the fast connect re-try fails, then we can print a message to
# indicate the failure, and start a slow re-try period.
self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
logging.error('%s Failed to connect to controller. %s %s', self.id, err.domain, err.message)
if self._should_try_to_reconnect():
logging.debug(
'Controller._on_connect_fail() - %s %s. Retry in %s sec.',
self.id,
err,
self._retry_connect_tmr.get_timeout(),
)
self._retry_connect_tmr.start()
def disconnect(self, disconnected_cb, keep_connection):
'''@brief Issue an asynchronous disconnect command to a Controller.
@ -507,11 +511,7 @@ class Dc(Controller):
return
logging.info(
'%s | %s - Controller not responding. Retrying...',
self.id,
self.device,
)
logging.info('%s | %s - Controller not responding. Retrying...', self.id, self.device)
self._ctrl_unresponsive_time = None
self._ctrl_unresponsive_tmr.stop()
@ -555,8 +555,8 @@ class Dc(Controller):
def _post_registration_actions(self):
# Need to check that supported_log_pages() is available (introduced in libnvme 1.2)
has_supported_log_pages = hasattr(self._ctrl, 'supported_log_pages')
if not has_supported_log_pages:
get_slp = getattr(self._ctrl, 'supported_log_pages', None)
if get_slp is None:
logging.warning(
'%s | %s - libnvme-%s does not support "Get supported log pages". Please upgrade libnvme.',
self.id,
@ -564,9 +564,9 @@ class Dc(Controller):
defs.LIBNVME_VERSION,
)
if conf.SvcConf().pleo_enabled and self._is_ddc() and has_supported_log_pages:
if conf.SvcConf().pleo_enabled and self._is_ddc() and get_slp is not None:
self._get_supported_op = gutil.AsyncTask(
self._on_get_supported_success, self._on_get_supported_fail, self._ctrl.supported_log_pages
self._on_get_supported_success, self._on_get_supported_fail, get_slp
)
self._get_supported_op.run_async()
else:
@ -580,21 +580,23 @@ class Dc(Controller):
'''
super()._on_connect_success(op_obj, data)
if self._alive():
self._ctrl_unresponsive_time = None
self._ctrl_unresponsive_tmr.stop()
self._ctrl_unresponsive_tmr.set_timeout(0)
if not self._alive():
return
if self._ctrl.is_registration_supported():
self._register_op = gutil.AsyncTask(
self._on_registration_success,
self._on_registration_fail,
self._ctrl.registration_ctlr,
nvme.NVMF_DIM_TAS_REGISTER,
)
self._register_op.run_async()
else:
self._post_registration_actions()
self._ctrl_unresponsive_time = None
self._ctrl_unresponsive_tmr.stop()
self._ctrl_unresponsive_tmr.set_timeout(0)
if self._ctrl.is_registration_supported():
self._register_op = gutil.AsyncTask(
self._on_registration_success,
self._on_registration_fail,
self._ctrl.registration_ctlr,
nvme.NVMF_DIM_TAS_REGISTER,
)
self._register_op.run_async()
else:
self._post_registration_actions()
def _on_connect_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
'''@brief Function called when we fail to connect to the Controller.'''
@ -613,35 +615,25 @@ class Dc(Controller):
refers to the fact that a successful exchange was made with the DC.
It doesn't mean that the registration itself succeeded.
'''
if self._alive():
if data is not None:
logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
else:
logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
self._post_registration_actions()
else:
if not self._alive():
logging.debug(
'Dc._on_registration_success() - %s | %s: Received event on dead object.', self.id, self.device
)
return
if data is not None:
logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
else:
logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
self._post_registration_actions()
def _on_registration_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
'''@brief Function called when we fail to register with the
Discovery Controller. See self._register_op object
for details.
'''
if self._alive():
logging.debug(
'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.REGISTRATION_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
else:
if not self._alive():
logging.debug(
'Dc._on_registration_fail() - %s | %s: Received event on dead object. %s',
self.id,
@ -649,6 +641,18 @@ class Dc(Controller):
err,
)
op_obj.kill()
return
logging.debug(
'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.REGISTRATION_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
# --------------------------------------------------------------------------
def _on_get_supported_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
@ -660,61 +664,46 @@ class Dc(Controller):
refers to the fact that a successful exchange was made with the DC.
It doesn't mean that the Get Supported Log Page itself succeeded.
'''
if self._alive():
try:
dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16
except (TypeError, IndexError):
dlp_supp_opts = 0
logging.debug(
'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s',
self.id,
self.device,
dlp_supp_opts,
dlp_supp_opts_as_string(dlp_supp_opts),
)
if 'lsp' in inspect.signature(self._ctrl.discover).parameters:
lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0
self._get_log_op = gutil.AsyncTask(
self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp
)
else:
logging.warning(
'%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.',
self.id,
self.device,
defs.LIBNVME_VERSION,
)
self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
self._get_log_op.run_async()
else:
if not self._alive():
logging.debug(
'Dc._on_get_supported_success() - %s | %s: Received event on dead object.', self.id, self.device
)
return
try:
dlp_supp_opts = data[nvme.NVME_LOG_LID_DISCOVER] >> 16
except (TypeError, IndexError):
dlp_supp_opts = 0
logging.debug(
'Dc._on_get_supported_success() - %s | %s: supported options = 0x%04X = %s',
self.id,
self.device,
dlp_supp_opts,
dlp_supp_opts_as_string(dlp_supp_opts),
)
if 'lsp' in inspect.signature(self._ctrl.discover).parameters:
lsp = nvme.NVMF_LOG_DISC_LSP_PLEO if dlp_supp_opts & nvme.NVMF_LOG_DISC_LID_PLEOS else 0
self._get_log_op = gutil.AsyncTask(
self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover, lsp
)
else:
logging.warning(
'%s | %s - libnvme-%s does not support setting PLEO bit. Please upgrade.',
self.id,
self.device,
defs.LIBNVME_VERSION,
)
self._get_log_op = gutil.AsyncTask(self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover)
self._get_log_op.run_async()
def _on_get_supported_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
'''@brief Function called when we fail to retrieve the supported log
page from the Discovery Controller. See self._get_supported_op object
for details.
'''
if self._alive():
logging.debug(
'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.GET_SUPPORTED_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error(
'%s | %s - Failed to Get supported log pages from Discovery Controller. %s',
self.id,
self.device,
err,
)
op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC)
else:
if not self._alive():
logging.debug(
'Dc._on_get_supported_fail() - %s | %s: Received event on dead object. %s',
self.id,
@ -722,6 +711,23 @@ class Dc(Controller):
err,
)
op_obj.kill()
return
logging.debug(
'Dc._on_get_supported_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.GET_SUPPORTED_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error(
'%s | %s - Failed to Get supported log pages from Discovery Controller. %s',
self.id,
self.device,
err,
)
op_obj.retry(Dc.GET_SUPPORTED_RETRY_RERIOD_SEC)
# --------------------------------------------------------------------------
def _on_get_log_success(self, op_obj: gutil.AsyncTask, data): # pylint: disable=unused-argument
@ -729,61 +735,51 @@ class Dc(Controller):
from the Discovery Controller. See self._get_log_op object
for details.
'''
if self._alive():
# Note that for historical reasons too long to explain, the CDC may
# return invalid addresses ('0.0.0.0', '::', or ''). Those need to
# be filtered out.
referrals_before = self.referrals()
self._log_pages = (
[
{k.strip(): str(v).strip() for k, v in dictionary.items()}
for dictionary in data
if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '')
]
if data
else list()
)
logging.info(
'%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
)
referrals_after = self.referrals()
self._serv.log_pages_changed(self, self.device)
if referrals_after != referrals_before:
logging.debug(
'Dc._on_get_log_success() - %s | %s: Referrals before = %s',
self.id,
self.device,
referrals_before,
)
logging.debug(
'Dc._on_get_log_success() - %s | %s: Referrals after = %s',
self.id,
self.device,
referrals_after,
)
self._serv.referrals_changed()
else:
if not self._alive():
logging.debug(
'Dc._on_get_log_success() - %s | %s: Received event on dead object.', self.id, self.device
)
return
# Note that for historical reasons too long to explain, the CDC may
# return invalid addresses ('0.0.0.0', '::', or ''). Those need to
# be filtered out.
referrals_before = self.referrals()
self._log_pages = (
[
{k.strip(): str(v).strip() for k, v in dictionary.items()}
for dictionary in data
if dictionary.get('traddr', '').strip() not in ('0.0.0.0', '::', '')
]
if data
else list()
)
logging.info(
'%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
)
referrals_after = self.referrals()
self._serv.log_pages_changed(self, self.device)
if referrals_after != referrals_before:
logging.debug(
'Dc._on_get_log_success() - %s | %s: Referrals before = %s',
self.id,
self.device,
referrals_before,
)
logging.debug(
'Dc._on_get_log_success() - %s | %s: Referrals after = %s',
self.id,
self.device,
referrals_after,
)
self._serv.referrals_changed()
def _on_get_log_fail(self, op_obj: gutil.AsyncTask, err, fail_cnt):
'''@brief Function called when we fail to retrieve the log pages
from the Discovery Controller. See self._get_log_op object
for details.
'''
if self._alive():
logging.debug(
'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
else:
if not self._alive():
logging.debug(
'Dc._on_get_log_fail() - %s | %s: Received event on dead object. %s',
self.id,
@ -791,6 +787,18 @@ class Dc(Controller):
err,
)
op_obj.kill()
return
logging.debug(
'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
self.id,
self.device,
err,
Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
)
if fail_cnt == 1: # Throttle the logs. Only print the first time the command fails
logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
# ******************************************************************************