#!/usr/bin/env python2
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

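"""Helpers to detect the storage type and storage state of a DUT.

The storage info is read via SMART and parsed to classify the drive as
SSD (SATA), NVMe or eMMC and to estimate how worn out it is.
"""
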
import logging
import time
import re

from autotest_lib.client.common_lib import error

# Storage types supported
STORAGE_TYPE_SSD = 'ssd'
STORAGE_TYPE_NVME = 'nvme'
STORAGE_TYPE_MMC = 'mmc'

# Storage states supported
STORAGE_STATE_NORMAL = 'normal'
STORAGE_STATE_WARNING = 'warning'
STORAGE_STATE_CRITICAL = 'critical'

BADBLOCK_CHECK_RO = 'RO'
BADBLOCK_CHECK_RW = 'RW'


class StorageError(error.TestFail):
    """Custom error class to indicate that storage info is unsupported
    or unavailable.
    """
    pass


class ConsoleError(error.TestFail):
    """Common error class for servod console-backed control failures."""
    pass


class StorageStateValidator(object):
    """Class to detect the type and state of the DUT storage.

    The class supports SSD, NVMe and MMC storage types.
    The state is detected and set as:
        - normal   - drive is in good shape
        - warning  - drive is close to worn out by some metric
        - critical - drive is worn out and has errors
    """

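    # Minimal usage sketch (illustrative; assumes `host` is a connected
    # cros_host object):
    #   validator = StorageStateValidator(host)
    #   storage_type = validator.get_type()    # 'ssd' | 'nvme' | 'mmc'
    #   storage_state = validator.get_state()  # 'normal'|'warning'|'critical'
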
    def __init__(self, host):
        """Initialize the storage validator.

        @param host: cros_host object providing console access
                     for reading the target info.

        @raises ConsoleError: if the info cannot be read
        @raises StorageError: if the info is not present
        """
        self._host = host
        self._storage_type = None
        self._storage_state = None
        self._info = []

        if not self._host:
            raise StorageError('Host is not provided')

        self._read_storage_info()

    def _read_storage_info(self):
        """Read the storage info from SMART.

        The info is collected as a list of lines.
        @raises StorageError: if no info is provided or the data is
                unavailable
        """
        logging.info('Extracting storage info')
        command = '. /usr/share/misc/storage-info-common.sh; get_storage_info'
        cmd_result = self._host.run(command, ignore_status=True)
        if cmd_result.exit_status != 0:
            raise StorageError('receive error: %s' % cmd_result.stderr)

        if cmd_result.stdout:
            self._info = cmd_result.stdout.splitlines()
        if len(self._info) == 0:
            raise StorageError('Storage info is empty')

    def get_type(self):
        """Determine the type of the storage on the host.

        @returns storage type (SSD, NVME, MMC)

        @raises StorageError: if the type is not supported or cannot be
                determined
        """
        if not self._storage_type:
            self._storage_type = self._get_storage_type()
        return self._storage_type

    def get_state(self, run_badblocks=None):
        """Determine the state of the storage on the host.

        @param run_badblocks: string key to run the badblocks check.
                None  - check if we can run it
                "NOT" - do not run the check
                "RW"  - run the read-write check if booted from USB
                "RO"  - run the read-only check
        @returns storage state (normal|warning|critical)

        @raises StorageError: if the type is not supported or the state
                cannot be determined
        """
        if not self._storage_state:
            storage_type = self.get_type()
            if storage_type == STORAGE_TYPE_SSD:
                self._storage_state = self._get_state_for_ssd()
            elif storage_type == STORAGE_TYPE_MMC:
                self._storage_state = self._get_state_for_mmc()
            elif storage_type == STORAGE_TYPE_NVME:
                self._storage_state = self._get_state_for_nvme()
        if (run_badblocks != 'NOT'
                    and self._storage_state != STORAGE_STATE_CRITICAL
                    and self._support_health_profile()):
            # Run badblocks if the storage is not in a critical state.
            # If a bad block is found then mark the storage as bad.
            logging.info('Trying to run badblocks on the device')
            dhp = self._host.health_profile
            usb_boot = self._host.is_boot_from_external_device()
            if run_badblocks is None:
                if _is_time_to_run_badblocks_ro(dhp):
                    run_badblocks = BADBLOCK_CHECK_RO
                # Blocked for now until we confirm that SMART stats do not
                # detect the issue before we do.
                # if usb_boot and _is_time_to_run_badblocks_rw(dhp):
                #     run_badblocks = BADBLOCK_CHECK_RW
            logging.debug('run_badblocks=%s', run_badblocks)
            if usb_boot and run_badblocks == BADBLOCK_CHECK_RW:
                self._run_read_write_badblocks_check()
                dhp.refresh_badblocks_rw_run_time()
                # RO is a subset of RW, so update its timestamp too.
                dhp.refresh_badblocks_ro_run_time()
            if run_badblocks == BADBLOCK_CHECK_RO:
                # SMART stats sometimes do not report an issue when blocks
                # are bad for reading, so we run the RO check.
                self._run_readonly_badblocks_check()
                dhp.refresh_badblocks_ro_run_time()
        return self._storage_state

    def _get_storage_type(self):
        """Read the info to detect the type of the storage by patterns."""
        logging.info('Extracting storage type')
        # Example "SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)"
        sata_detect = r"SATA Version is:.*"

        # Example " Extended CSD rev 1.7 (MMC 5.0)"
        mmc_detect = r"\s*Extended CSD rev.*MMC (?P<version>\d+.\d+)"

        # Example "SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)"
        nvme_detect = r".*NVMe Log .*"

        for line in self._info:
            if re.match(sata_detect, line):
                logging.info('Found SATA device')
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_SSD

            m = re.match(mmc_detect, line)
            if m:
                version = m.group('version')
                logging.info('Found eMMC device, version: %s', version)
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_MMC

            if re.match(nvme_detect, line):
                logging.info('Found NVMe device')
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_NVME
        raise StorageError('Storage type cannot be detected')

    def _get_state_for_ssd(self):
        """Read the info to detect the state of SSD storage."""
        logging.info('Extracting metrics for SSD storage')
        # Field meaning and an example line that has a failing attribute:
        # https://en.wikipedia.org/wiki/S.M.A.R.T.
        # ID# ATTRIBUTE_NAME     FLAGS    VALUE WORST THRESH FAIL RAW_VALUE
        # 184 End-to-End_Error   PO--CK   001   001   097    NOW  135
        ssd_fail = r"""\s*(?P<param>\S+\s\S+)      # ID and attribute name
                    \s+[P-][O-][S-][R-][C-][K-]    # flags
                    (\s+\d{3}){3}                  # three 3-digit numbers
                    \s+NOW                         # fail indicator"""

        ssd_relocate_sectors = r"""\s*\d\sReallocated_Sector_Ct
                    \s*[P-][O-][S-][R-][C-][K-]    # flags
                    \s*(?P<value>\d{3})            # VALUE
                    \s*(?P<worst>\d{3})            # WORST
                    \s*(?P<thresh>\d{3})           # THRESH
                    """
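        # Illustrative line the pattern above is meant to match
        # (smartctl brief attribute format, assumed):
        #   5 Reallocated_Sector_Ct   PO--CK   100   100   000    -    0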
        # Future optimization: read the GP log and determine the percentage.
        for line in self._info:
            if re.match(ssd_fail, line, re.VERBOSE):
                logging.debug('Found fail line => ' + line)
                return STORAGE_STATE_CRITICAL

            m = re.match(ssd_relocate_sectors, line, re.VERBOSE)
            if m:
                logging.info('Found Reallocated_Sector_Ct line => ' + line)
                value = int(m.group('value'))
                # The manufacturer sets the default value to 100; if the
                # number starts to grow then it is time to flag the drive.
                if value > 100:
                    return STORAGE_STATE_WARNING
        return STORAGE_STATE_NORMAL

    def _get_state_for_mmc(self):
        """Read the info to detect the state of MMC storage."""
        logging.debug('Extracting metrics for MMC storage')
        # Example:
        # Device life time type A [DEVICE_LIFE_TIME_EST_TYP_A: 0x01]
        # 0x00~9 means 0-90% band
        # 0x0a means 90-100% band
        # 0x0b means over 100% band
        mmc_fail_lev = r""".*(?P<param>DEVICE_LIFE_TIME_EST_TYP_.)]?:
                        \s*0x0(?P<val>\S)"""  # lifetime percentage

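        # For example, a reported value of 0x02 falls in the 10-20% band
        # and maps to val = 2 * 10 = 20 in the loop below.
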
        # Example "Pre EOL information [PRE_EOL_INFO: 0x01]"
        # 0x00 - not defined
        # 0x01 - Normal
        # 0x02 - Warning, consumed 80% of the reserved blocks
        # 0x03 - Urgent, consumed 90% of the reserved blocks
        mmc_fail_eol = r".*(?P<param>PRE_EOL_INFO.)]?: 0x0(?P<val>\d)"

        eol_value = 0
        lev_value = -1
        for line in self._info:
            m = re.match(mmc_fail_lev, line, re.VERBOSE)
            if m:
                param = m.group('val')
                logging.debug('Found line for lifetime estimate => ' + line)
                if 'a' == param:
                    val = 100
                elif 'b' == param:
                    val = 101
                else:
                    val = int(param) * 10
                if val > lev_value:
                    lev_value = val
                continue

            m = re.match(mmc_fail_eol, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for end-of-life => ' + line)
                eol_value = int(param)
                break

        # Set the state based on the end-of-life info.
        if eol_value == 3:
            return STORAGE_STATE_CRITICAL
        elif eol_value == 2:
            return STORAGE_STATE_WARNING
        elif eol_value == 1:
            return STORAGE_STATE_NORMAL

        # Set the state based on the lifetime estimates.
        elif lev_value < 90:
            return STORAGE_STATE_NORMAL
        elif lev_value < 100:
            return STORAGE_STATE_WARNING
        return STORAGE_STATE_CRITICAL

    def _get_state_for_nvme(self):
        """Read the info to detect the state of NVMe storage."""
        logging.debug('Extracting metrics for NVMe storage')
        # Example "Percentage Used: 100%"
        nvme_fail = r"Percentage Used:\s+(?P<param>(\d{1,3}))%"
        used_value = -1
        for line in self._info:
            m = re.match(nvme_fail, line)
            if m:
                param = m.group('param')
                logging.debug('Found line for usage => ' + line)
                try:
                    val = int(param)
                    used_value = val
                except ValueError:
                    logging.info('Could not cast %s to int', param)
                break

        if used_value < 91:
            return STORAGE_STATE_NORMAL
        # Stop marking the device as bad when it reaches 100% usage.
        # TODO(otabek) crbug.com/1140507 re-evaluate the max usage
        return STORAGE_STATE_WARNING

    def _get_device_storage_path(self):
        """Find and return the path to the device storage.

        The method supports detection even when the device is booted
        from USB.

        @returns path to the main device, like '/dev/XXXX'
        """
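        # get_fixed_dst_drive typically prints the root block device,
        # e.g. '/dev/sda', '/dev/mmcblk0' or '/dev/nvme0n1' (illustrative
        # values).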
        # Find the name of the device storage.
        cmd = ('. /usr/sbin/write_gpt.sh;'
               ' . /usr/share/misc/chromeos-common.sh;'
               ' load_base_vars; get_fixed_dst_drive')
        cmd_result = self._host.run(cmd,
                                    ignore_status=True,
                                    timeout=60)
        if cmd_result.exit_status != 0:
            logging.debug('Failed to detect the path to the device storage')
            return None
        return cmd_result.stdout.strip()

    def _run_readonly_badblocks_check(self):
        """Run a badblocks read-only verification on the device storage.

        The block size is set to 512 bytes.
        """
        path = self._get_device_storage_path()
        if not path:
            # Cannot continue if the storage was not detected.
            return
        logging.info("Running readonly badblocks check; path=%s", path)
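        # badblocks flags: -e 1 aborts after the first bad block found,
        # -s shows scan progress, -b 512 uses a 512-byte block size.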
        cmd = 'badblocks -e 1 -s -b 512 %s' % path
        try:
            # Set the limit to 1 hour but expect it to finish within
            # 30 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=3600)
            if cmd_result.exit_status != 0:
                logging.debug('badblocks check failed; exit status: %s',
                              cmd_result.exit_status)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running the badblocks check')
            logging.debug(str(e))

    def _run_read_write_badblocks_check(self):
        """Run a non-destructive read-write check on the device storage.

        The block size is set to 4096 bytes.
        We can run this check only when the DUT is booted from USB.
        """
        path = self._get_device_storage_path()
        if not path:
            # Cannot continue if the storage was not detected.
            return
        logging.info("Running read-write badblocks check; path=%s", path)
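        # badblocks flags: -n selects the non-destructive read-write mode,
        # -s shows progress, -v is verbose, -b 4096 uses a 4096-byte block
        # size, -e 1 aborts after the first bad block found.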
        cmd = 'badblocks -e 1 -nsv -b 4096 %s' % path
        try:
            # Set the limit to 90 minutes but expect it to finish within
            # 50 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=5400)
            if cmd_result.exit_status != 0:
                logging.debug('badblocks check failed; exit status: %s',
                              cmd_result.exit_status)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running the badblocks check')
            logging.info('(Not critical) %s', e)

    def _support_health_profile(self):
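        """Check if the host has an attached device health profile."""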
        return (hasattr(self._host, 'health_profile')
                and self._host.health_profile)


def _is_time_to_run_badblocks_ro(dhp):
    """Verify that the device can proceed with the read-only badblocks check.

    The RO check can be executed no more often than once per 6 days.

    @returns True if we can proceed, False if not
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_ro_run_time_epoch()
    can_run = today_time > (last_check + (6 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'Running RO badblocks is not allowed because it was run'
                ' recently; last run %s. The RO check is allowed to run only'
                ' once per 6 days', dhp.get_badblocks_ro_run_time())
    return can_run


def _is_time_to_run_badblocks_rw(dhp):
    """Verify that the device can proceed with the read-write badblocks check.

    The RW check can be executed no more often than once per 60 days.

    @returns True if we can proceed, False if not
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_rw_run_time_epoch()
    can_run = today_time > (last_check + (60 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'Running RW badblocks is not allowed because it was run'
                ' recently; last run %s. The RW check is allowed to run only'
                ' once per 60 days', dhp.get_badblocks_rw_run_time())
    return can_run