From 101fc29686a4bf327521f7a4025a6c301db89e84 Mon Sep 17 00:00:00 2001 From: Mudit Date: Thu, 10 Sep 2020 10:29:47 -0400 Subject: [PATCH] Add GPU reporting to idrac-wsman inspect interface This patch implements reporting the number of NVIDIA Tesla T4 GPU devices connected to a system by discovering such devices and reporting their count through the capability 'pci_gpu_devices'. Change-Id: If713895f05f08a9827c4c085108abb3e388b2a2e Story: 2008118 Task: 40839 Depends-On: https://review.opendev.org/#/c/750364/ --- doc/source/admin/drivers/idrac.rst | 1 + driver-requirements.txt | 2 +- ironic/drivers/modules/drac/inspect.py | 24 +++- .../unit/drivers/modules/drac/test_inspect.py | 115 +++++++++++++++++- ...pu-reporting-support-f4d80e2071f85f6a.yaml | 8 ++ 5 files changed, 145 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml diff --git a/doc/source/admin/drivers/idrac.rst b/doc/source/admin/drivers/idrac.rst index 15a1a6671a..f77eb8dc4f 100644 --- a/doc/source/admin/drivers/idrac.rst +++ b/doc/source/admin/drivers/idrac.rst @@ -259,6 +259,7 @@ The inspection discovers the following properties: Extra capabilities: * ``boot_mode``: UEFI or BIOS boot mode. +* ``pci_gpu_devices``: number of GPU devices connected to the bare metal. It also creates baremetal ports for each NIC port detected in the system. 
The ``idrac-wsman`` inspect interface discovers which NIC ports are diff --git a/driver-requirements.txt b/driver-requirements.txt index ec736ad6c2..b00680fa2d 100644 --- a/driver-requirements.txt +++ b/driver-requirements.txt @@ -7,7 +7,7 @@ proliantutils>=2.10.0 pysnmp>=4.3.0,<5.0.0 python-scciclient>=0.8.0 -python-dracclient>=3.1.0,<6.0.0 +python-dracclient>=5.1.0,<6.0.0 python-xclarityclient>=0.1.6 # The Redfish hardware type uses the Sushy library diff --git a/ironic/drivers/modules/drac/inspect.py b/ironic/drivers/modules/drac/inspect.py index 620a32273c..77e48226fc 100644 --- a/ironic/drivers/modules/drac/inspect.py +++ b/ironic/drivers/modules/drac/inspect.py @@ -49,6 +49,8 @@ class DracRedfishInspect(redfish_inspect.RedfishInspect): class DracWSManInspect(base.InspectInterface): + _GPU_SUPPORTED_LIST = {"TU104GL [Tesla T4]"} + def get_properties(self): """Return the properties of the interface. @@ -98,9 +100,12 @@ class DracWSManInspect(base.InspectInterface): properties['cpu_arch'] = 'x86_64' if cpus[0].arch64 else 'x86' bios_settings = client.list_bios_settings() + video_controllers = client.list_video_controllers() current_capabilities = node.properties.get('capabilities', '') new_capabilities = { - 'boot_mode': bios_settings["BootMode"].current_value.lower()} + 'boot_mode': bios_settings["BootMode"].current_value.lower(), + 'pci_gpu_devices': self._calculate_gpus(video_controllers)} + capabilties = utils.get_updated_capabilities(current_capabilities, new_capabilities) properties['capabilities'] = capabilties @@ -190,6 +195,23 @@ class DracWSManInspect(base.InspectInterface): else: return cpu.cores + def _calculate_gpus(self, video_controllers): + """Find actual GPU count. + + This method reports number of NVIDIA Tesla T4 GPU devices present + on the server. + + :param video_controllers: list of video controllers. + + :returns: returns total gpu count. 
+ """ + gpu_cnt = 0 + for video_controller in video_controllers: + for gpu in self._GPU_SUPPORTED_LIST: + if video_controller.description == gpu: + gpu_cnt += 1 + return gpu_cnt + def _get_pxe_dev_nics(self, client, nics, node): """Get a list of pxe device interfaces. diff --git a/ironic/tests/unit/drivers/modules/drac/test_inspect.py b/ironic/tests/unit/drivers/modules/drac/test_inspect.py index 628f3c855d..ecb9346f24 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_inspect.py +++ b/ironic/tests/unit/drivers/modules/drac/test_inspect.py @@ -135,6 +135,23 @@ class DracInspectionTestCase(test_utils.BaseDracTest): 'PxeDev4Interface': None} nic_settings = {'LegacyBootProto': {'current_value': 'PXE'}, 'FQDD': 'NIC.Embedded.1-1-1'} + video_controllers = [ + {'id': 'Video.Embedded.1-1', + 'description': 'Integrated Matrox G200eW3 Graphics Controller', + 'function_number': 0, + 'manufacturer': 'Matrox Electronics Systems Ltd.', + 'pci_device_id': '0536', + 'pci_vendor_id': '102B', + 'pci_subdevice_id': '0737', + 'pci_subvendor_id': '1028'}, + {'id': 'Video.Slot.7-1', + 'description': 'TU104GL [Tesla T4]', + 'function_number': 0, + 'manufacturer': 'NVIDIA Corporation', + 'pci_device_id': '1EB8', + 'pci_vendor_id': '10DE', + 'pci_subdevice_id': '12A2', + 'pci_subvendor_id': '10DE'}] self.memory = [test_utils.dict_to_namedtuple(values=m) for m in memory] self.cpus = [test_utils.dict_to_namedtuple(values=c) for c in cpus] @@ -146,6 +163,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest): self.bios_boot_settings = test_utils.dict_of_object(bios_boot_settings) self.uefi_boot_settings = test_utils.dict_of_object(uefi_boot_settings) self.nic_settings = test_utils.dict_of_object(nic_settings) + self.video_controllers = [test_utils.dict_to_namedtuple(values=vc) + for vc in video_controllers] def test_get_properties(self): expected = drac_common.COMMON_PROPERTIES @@ -161,7 +180,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest): 'local_gb': 1116, 'cpus': 18, 
'cpu_arch': 'x86_64', - 'capabilities': 'boot_mode:uefi'} + 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'} mock_client = mock.Mock() mock_get_drac_client.return_value = mock_client mock_client.list_memory.return_value = self.memory @@ -169,6 +188,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest): mock_client.list_virtual_disks.return_value = self.virtual_disks mock_client.list_nics.return_value = self.nics mock_client.list_bios_settings.return_value = self.uefi_boot_settings + mock_client.list_video_controllers.return_value = \ + self.video_controllers with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: @@ -191,6 +212,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest): mock_client.list_virtual_disks.side_effect = ( drac_exceptions.BaseClientException('boom')) mock_client.list_bios_settings.return_value = self.bios_boot_settings + mock_client.list_video_controllers.return_value = \ + self.video_controllers with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: @@ -207,7 +230,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest): 'local_gb': 279, 'cpus': 18, 'cpu_arch': 'x86_64', - 'capabilities': 'boot_mode:uefi'} + 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'} mock_client = mock.Mock() mock_get_drac_client.return_value = mock_client mock_client.list_memory.return_value = self.memory @@ -216,6 +239,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest): mock_client.list_physical_disks.return_value = self.physical_disks mock_client.list_nics.return_value = self.nics mock_client.list_bios_settings.return_value = self.uefi_boot_settings + mock_client.list_video_controllers.return_value = \ + self.video_controllers with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: @@ -239,12 +264,94 @@ class DracInspectionTestCase(test_utils.BaseDracTest): mock_client.list_physical_disks.return_value = self.physical_disks mock_client.list_nics.return_value = self.nics 
mock_client.list_bios_settings.return_value = self.uefi_boot_settings + mock_client.list_video_controllers.return_value = \ + self.video_controllers with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: self.assertRaises(exception.HardwareInspectionFailure, task.driver.inspect.inspect_hardware, task) + @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, + autospec=True) + @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True) + def test_inspect_hardware_no_supported_gpu(self, mock_port_create, + mock_get_drac_client): + controllers = [ + {'id': 'Video.Embedded.1-1', + 'description': 'Integrated Matrox G200eW3 Graphics Controller', + 'function_number': 0, + 'manufacturer': 'Matrox Electronics Systems Ltd.', + 'pci_device_id': '0536', + 'pci_vendor_id': '102B', + 'pci_subdevice_id': '0737', + 'pci_subvendor_id': '1028'}, + {'id': 'Video.Slot.7-1', + 'description': 'GV100GL [Tesla V100 PCIe 16GB]]', + 'function_number': 0, + 'manufacturer': 'NVIDIA Corporation', + 'pci_device_id': '1DB4', + 'pci_vendor_id': '10DE', + 'pci_subdevice_id': '1214', + 'pci_subvendor_id': '10DE'}] + + expected_node_properties = { + 'memory_mb': 32768, + 'local_gb': 279, + 'cpus': 18, + 'cpu_arch': 'x86_64', + 'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'} + mock_client = mock.Mock() + mock_get_drac_client.return_value = mock_client + mock_client.list_memory.return_value = self.memory + mock_client.list_cpus.return_value = self.cpus + mock_client.list_virtual_disks.return_value = [] + mock_client.list_physical_disks.return_value = self.physical_disks + mock_client.list_nics.return_value = self.nics + mock_client.list_bios_settings.return_value = self.uefi_boot_settings + video_controllers = [test_utils.dict_to_namedtuple(values=vc) + for vc in controllers] + mock_client.list_video_controllers.return_value = video_controllers + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + return_value = 
task.driver.inspect.inspect_hardware(task) + + self.node.refresh() + self.assertEqual(expected_node_properties, self.node.properties) + self.assertEqual(states.MANAGEABLE, return_value) + self.assertEqual(2, mock_port_create.call_count) + + @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, + autospec=True) + @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True) + def test_inspect_hardware_no_gpu(self, mock_port_create, + mock_get_drac_client): + expected_node_properties = { + 'memory_mb': 32768, + 'local_gb': 279, + 'cpus': 18, + 'cpu_arch': 'x86_64', + 'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'} + mock_client = mock.Mock() + mock_get_drac_client.return_value = mock_client + mock_client.list_memory.return_value = self.memory + mock_client.list_cpus.return_value = self.cpus + mock_client.list_virtual_disks.return_value = [] + mock_client.list_physical_disks.return_value = self.physical_disks + mock_client.list_nics.return_value = self.nics + mock_client.list_bios_settings.return_value = self.uefi_boot_settings + mock_client.list_video_controllers.return_value = [] + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + return_value = task.driver.inspect.inspect_hardware(task) + + self.node.refresh() + self.assertEqual(expected_node_properties, self.node.properties) + self.assertEqual(states.MANAGEABLE, return_value) + self.assertEqual(2, mock_port_create.call_count) + @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, autospec=True) @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True) @@ -255,7 +362,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest): 'local_gb': 1116, 'cpus': 18, 'cpu_arch': 'x86_64', - 'capabilities': 'boot_mode:uefi'} + 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'} mock_client = mock.Mock() mock_get_drac_client.return_value = mock_client mock_client.list_memory.return_value = self.memory @@ -263,6 +370,8 @@ class 
DracInspectionTestCase(test_utils.BaseDracTest): mock_client.list_virtual_disks.return_value = self.virtual_disks mock_client.list_nics.return_value = self.nics mock_client.list_bios_settings.return_value = self.uefi_boot_settings + mock_client.list_video_controllers.return_value = \ + self.video_controllers mock_port_create.side_effect = exception.MACAlreadyExists("boom") diff --git a/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml b/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml new file mode 100644 index 0000000000..fb4a841487 --- /dev/null +++ b/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Adds support to the ``idrac-wsman`` inspect interface for reporting the + number of GPU devices connected to a system. This information is advertised + through the ``pci_gpu_devices`` capability, which can be used to make + scheduling decisions for the node. Currently, only NVIDIA Tesla T4 GPU + devices are counted.