diff --git a/smartmon.py b/smartmon.py index 82e4231..32edf18 100755 --- a/smartmon.py +++ b/smartmon.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +""" +Formatted with Black: +$ black -l 100 smartmon.py +""" + import argparse import collections import csv @@ -9,71 +14,71 @@ import sys from prometheus_client import CollectorRegistry, Gauge, generate_latest -device_info_re = re.compile(r'^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$') +device_info_re = re.compile(r"^(?P[^:]+?)(?:(?:\sis|):)\s*(?P.*)$") +COMPAT_DISK_LABELS = False -ata_error_count_re = re.compile( - r'^Error (\d+) \[\d+\] occurred', re.MULTILINE) +ata_error_count_re = re.compile(r"^Error (\d+) \[\d+\] occurred", re.MULTILINE) -self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE) +self_test_re = re.compile(r"^SMART.*(PASSED|OK)$", re.MULTILINE) device_info_map = { - 'Vendor': 'vendor', - 'Product': 'product', - 'Revision': 'revision', - 'Logical Unit id': 'lun_id', - 'Model Family': 'model_family', - 'Device Model': 'device_model', - 'Serial Number': 'serial_number', - 'Serial number': 'serial_number', - 'Firmware Version': 'firmware_version', + "Vendor": "vendor", + "Product": "product", + "Revision": "revision", + "Logical Unit id": "lun_id", + "Model Family": "model_family", + "Device Model": "device_model", + "Serial Number": "serial_number", + "Serial number": "serial_number", + "Firmware Version": "firmware_version", } smart_attributes_whitelist = ( - 'airflow_temperature_cel', - 'command_timeout', - 'current_pending_sector', - 'end_to_end_error', - 'erase_fail_count_total', - 'g_sense_error_rate', - 'hardware_ecc_recovered', - 'host_reads_mib', - 'host_reads_32mib', - 'host_writes_mib', - 'host_writes_32mib', - 'load_cycle_count', - 'lifetime_writes_gib', - 'media_wearout_indicator', - 'percent_lifetime_remain', - 'wear_leveling_count', - 'nand_writes_1gib', - 'offline_uncorrectable', - 'percent_lifetime_remain', - 'power_cycle_count', - 'power_on_hours', - 'program_fail_count', 
'raw_read_error_rate', - 'reallocated_event_count', - 'reallocated_sector_ct', - 'reported_uncorrect', - 'sata_downshift_count', - 'seek_error_rate', - 'spin_retry_count', - 'spin_up_time', - 'start_stop_count', - 'temperature_case', - 'temperature_celsius', - 'temperature_internal', - 'total_bad_block', - 'total_lbas_read', - 'total_lbas_written', - 'total_writes_gib', - 'total_reads_gib', - 'udma_crc_error_count', - 'unsafe_shutdown_count', - 'unexpect_power_loss_ct', - 'workld_host_reads_perc', - 'workld_media_wear_indic', - 'workload_minutes', + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count_total", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_mib", + "host_reads_32mib", + "host_writes_mib", + "host_writes_32mib", + "load_cycle_count", + "lifetime_writes_gib", + "media_wearout_indicator", + "percent_lifetime_remain", + "wear_leveling_count", + "nand_writes_1gib", + "offline_uncorrectable", + "percent_lifetime_remain", + "power_cycle_count", + "power_on_hours", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_bad_block", + "total_lbas_read", + "total_lbas_written", + "total_writes_gib", + "total_reads_gib", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unexpect_power_loss_ct", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", ) registry = CollectorRegistry() @@ -140,7 +145,6 @@ namespace=namespace, registry=registry, ), - # SMART attributes - ATA disks only "attr_value": Gauge( "attr_value", @@ -179,13 +183,24 @@ ), } -SmartAttribute = collections.namedtuple('SmartAttribute', [ - 'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated', - 'when_failed', 
'raw_value', -]) +SmartAttribute = collections.namedtuple( + "SmartAttribute", + [ + "id", + "name", + "flag", + "value", + "worst", + "threshold", + "type", + "updated", + "when_failed", + "raw_value", + ], +) -class Device(collections.namedtuple('DeviceBase', 'path opts')): +class Device(collections.namedtuple("DeviceBase", "path opts")): """Representation of a device as found by smartctl --scan output.""" @property @@ -194,10 +209,12 @@ def type(self): @property def base_labels(self): - return {'device': self.path, 'disk': self.type.partition('+')[2] or '0'} + if COMPAT_DISK_LABELS: + return {"disk": self.path, "device": self.type.partition("+")[2] or "0"} + return {"device": self.path, "disk": self.type.partition("+")[2] or "0"} def smartctl_select(self): - return ['--device', self.type, self.path] + return ["--device", self.type, self.path] def smart_ctl(*args, check=True): @@ -206,13 +223,13 @@ def smart_ctl(*args, check=True): Returns: (str) Data piped to stdout by the smartctl subprocess. """ - return subprocess.run( - ['smartctl', *args], stdout=subprocess.PIPE, check=check - ).stdout.decode('utf-8') + return subprocess.run(["smartctl", *args], stdout=subprocess.PIPE, check=check).stdout.decode( + "utf-8" + ) def smart_ctl_version(): - return smart_ctl('-V').split('\n')[0].split()[1] + return smart_ctl("-V").split("\n")[0].split()[1] def find_devices(by_id): @@ -222,14 +239,14 @@ def find_devices(by_id): (Device) Single device found by smartctl. 
""" parser = argparse.ArgumentParser() - parser.add_argument('-d', '--device', dest='type') + parser.add_argument("-d", "--device", dest="type") - args = ['--scan-open'] + args = ["--scan-open"] if by_id: - args.extend(['-d', 'by-id']) + args.extend(["-d", "by-id"]) devices = smart_ctl(*args) - for device in devices.split('\n'): + for device in devices.split("\n"): device = device.strip() if not device: continue @@ -251,7 +268,7 @@ def device_is_active(device): (bool) True if the device is active and False otherwise. """ try: - smart_ctl('--nocheck', 'standby', *device.smartctl_select()) + smart_ctl("--nocheck", "standby", *device.smartctl_select()) except subprocess.CalledProcessError: return False @@ -270,9 +287,7 @@ def device_info(device): key (str): Key describing the value. value (str): Actual value. """ - info_lines = smart_ctl( - '--info', *device.smartctl_select() - ).strip().split('\n')[3:] + info_lines = smart_ctl("--info", *device.smartctl_select()).strip().split("\n")[3:] matches = (device_info_re.match(line) for line in info_lines) return (m.groups() for m in matches if m is not None) @@ -292,12 +307,10 @@ def device_smart_capabilities(device): """ groups = device_info(device) - state = { - g[1].split(' ', 1)[0] - for g in groups if g[0] == 'SMART support'} + state = {g[1].split(" ", 1)[0] for g in groups if g[0] == "SMART support"} - smart_available = 'Available' in state - smart_enabled = 'Enabled' in state + smart_available = "Available" in state + smart_enabled = "Enabled" in state return smart_available, smart_enabled @@ -329,7 +342,7 @@ def collect_device_health_self_assessment(device): Args: device: (Device) Device in question. 
""" - out = smart_ctl('--health', *device.smartctl_select(), check=False) + out = smart_ctl("--health", *device.smartctl_select(), check=False) self_assessment_passed = bool(self_test_re.search(out)) metrics["device_smart_healthy"].labels( @@ -339,17 +352,15 @@ def collect_device_health_self_assessment(device): def collect_ata_metrics(device): # Fetch SMART attributes for the given device. - attributes = smart_ctl( - '--attributes', *device.smartctl_select() - ) + attributes = smart_ctl("--attributes", *device.smartctl_select()) # replace multiple occurrences of whitespace with a single whitespace # so that the CSV Parser recognizes individual columns properly. - attributes = re.sub(r'[\t\x20]+', ' ', attributes) + attributes = re.sub(r"[\t\x20]+", " ", attributes) # Turn smartctl output into a list of lines and skip to the table of # SMART attributes. - attribute_lines = attributes.strip().split('\n')[7:] + attribute_lines = attributes.strip().split("\n")[7:] # Some attributes have multiple IDs but have the same name. Don't # yield attributes that already have been reported before. @@ -358,38 +369,40 @@ def collect_ata_metrics(device): reader = csv.DictReader( (line.strip() for line in attribute_lines), fieldnames=SmartAttribute._fields[:-1], - restkey=SmartAttribute._fields[-1], delimiter=' ') + restkey=SmartAttribute._fields[-1], + delimiter=" ", + ) for entry in reader: # We're only interested in the SMART attributes that are # whitelisted here. - entry['name'] = entry['name'].lower() - if entry['name'] not in smart_attributes_whitelist: + entry["name"] = entry["name"].lower() + if entry["name"] not in smart_attributes_whitelist: continue # Ensure that only the numeric parts are fetched from the raw_value. # Attributes such as 194 Temperature_Celsius reported by my SSD # are in the format of "36 (Min/Max 24/40)" which can't be expressed # properly as a prometheus metric. 
- m = re.match(r'^(\d+)', ' '.join(entry['raw_value'])) + m = re.match(r"^(\d+)", " ".join(entry["raw_value"])) if not m: continue - entry['raw_value'] = m.group(1) + entry["raw_value"] = m.group(1) # Some device models report "---" in the threshold value where most # devices would report "000". We do the substitution here because # downstream code expects values to be convertible to integer. - if entry['threshold'] == '---': - entry['threshold'] = '0' + if entry["threshold"] == "---": + entry["threshold"] = "0" - if entry['name'] in smart_attributes_whitelist and entry['name'] not in seen: - for col in 'value', 'worst', 'threshold', 'raw_value': + if entry["name"] in smart_attributes_whitelist and entry["name"] not in seen: + for col in "value", "worst", "threshold", "raw_value": metrics["attr_" + col].labels( device.base_labels["device"], device.base_labels["disk"], entry["name"], ).set(entry[col]) - seen.add(entry['name']) + seen.add(entry["name"]) def collect_ata_error_count(device): @@ -398,22 +411,22 @@ def collect_ata_error_count(device): Args: device: (Device) Device in question. """ - error_log = smart_ctl( - '-l', 'xerror,1', *device.smartctl_select(), check=False) + error_log = smart_ctl("-l", "xerror,1", *device.smartctl_select(), check=False) m = ata_error_count_re.search(error_log) error_count = m.group(1) if m is not None else 0 - metrics["device_errors"].labels( - device.base_labels["device"], device.base_labels["disk"] - ).set(error_count) + metrics["device_errors"].labels(device.base_labels["device"], device.base_labels["disk"]).set( + error_count + ) def collect_disks_smart_metrics(wakeup_disks, by_id): for device in find_devices(by_id): is_active = device_is_active(device) metrics["device_active"].labels( - device.base_labels["device"], device.base_labels["disk"], + device.base_labels["device"], + device.base_labels["disk"], ).set(is_active) # Skip further metrics collection to prevent the disk from spinning up. 
@@ -439,24 +452,43 @@ def collect_disks_smart_metrics(wakeup_disks, by_id): collect_device_health_self_assessment(device) - if device.type.startswith('sat'): + if device.type.startswith("sat"): collect_ata_metrics(device) collect_ata_error_count(device) def main(): + global COMPAT_DISK_LABELS + parser = argparse.ArgumentParser() - parser.add_argument('-s', '--wakeup-disks', dest='wakeup_disks', action='store_true', - help="Wake up disks to collect live stats") - parser.add_argument('--by-id', dest='by_id', action='store_true', - help="Use /dev/disk/by-id/X instead of /dev/sdX to index devices") + parser.add_argument( + "-s", + "--wakeup-disks", + dest="wakeup_disks", + action="store_true", + help="Wake up disks to collect live stats", + ) + parser.add_argument( + "--by-id", + dest="by_id", + action="store_true", + help="Use /dev/disk/by-id/X instead of /dev/sdX to index devices", + ) + parser.add_argument( + "--compat-disk-labels", + dest="compat_disk_labels", + action="store_true", + help="Invert disk and device labels to be retrocompatible with shell implementation", + ) args = parser.parse_args(sys.argv[1:]) + COMPAT_DISK_LABELS = args.compat_disk_labels + metrics["smartctl_version"].labels(smart_ctl_version()).set(1) collect_disks_smart_metrics(args.wakeup_disks, args.by_id) print(generate_latest(registry).decode(), end="") -if __name__ == '__main__': +if __name__ == "__main__": main()