UnixPedia : HPUX / LINUX / SOLARIS: May 2018

Monday, May 28, 2018

HPUX : Device busy during VG deactivation

While troubleshooting a "device busy" error during VG deactivation on node jupiterA, we first examined the SAS target instances for the LUNs involved:

-- Target Instance: p sasd_tgt_instance_t 0xe0000001cacf2840 --
target state             = sasds_tgt_ready
current open count       = 1                                             <<<<<<<<
it_nxs_abt_cap           = TGT_IT_NXS_ABT_UNKNOWN
tgt_info:
tgt_hdl                  = 0x13
iport_hdl                = 0x0
tgt_sasaddr              = 0x5000c5003bfef6c9                   <<<<<<<<< this is c4t4d0
tgt_health               = SAS_HEALTH_ONLINE
iport_sasaddr            = 0x500605b002a6aab4
tgt_type                 = SAS_TGT_TYPE_SCSI
tgt_proto_cap            = SAS_TGT_PROTO_SSP_CAPABLE
tgt_topology             = SAS_TGT_TOPO_EXPANDER
slot                     = 11
enc_id                   = 0x2
tgt_enc type             = SAS_TGT_ENC_TYPE_EXT_SES2
-- Target statistics --
        tgt_open_cnt             = 31697
        tgt_close_cnt            = 31696
        tgt_scsi_layer_ios       = 143145362
        tgt_scsi_layer_io_success= 143132736
        tgt_scsi_layer_io_fails  = 12754
-- Target Instance: p sasd_tgt_instance_t 0xe0000001cad3f080 --
target state             = sasds_tgt_ready  
current open count       = 1                                             <<<<<<<<<
it_nxs_abt_cap           = TGT_IT_NXS_ABT_UNKNOWN
tgt_info:
tgt_hdl                  = 0x14
iport_hdl                = 0x0
tgt_sasaddr              = 0x5000c5003c0ab4b5                 <<<<<<<< this is c4t5d0
tgt_health               = SAS_HEALTH_ONLINE
iport_sasaddr            = 0x500605b002a6aab4
tgt_type                 = SAS_TGT_TYPE_SCSI
tgt_proto_cap            = SAS_TGT_PROTO_SSP_CAPABLE
tgt_topology             = SAS_TGT_TOPO_EXPANDER
slot                     = 12
enc_id                   = 0x2
tgt_enc type             = SAS_TGT_ENC_TYPE_EXT_SES2
-- Target statistics --
        tgt_open_cnt             = 31630
        tgt_close_cnt            = 31629
        tgt_scsi_layer_ios       = 3174756
        tgt_scsi_layer_io_success= 3162137
        tgt_scsi_layer_io_fails  = 12747
-- Target Instance: p sasd_tgt_instance_t 0xe0000001cad66040 --
target state             = sasds_tgt_ready
current open count       = 1                                             <<<<<<<<<<<
it_nxs_abt_cap           = TGT_IT_NXS_ABT_UNKNOWN
tgt_info:
tgt_hdl                  = 0x13
iport_hdl                = 0x0
tgt_sasaddr              = 0x5000c5003c099ddd                 <<<<<<<<<< this is c5t4d0
tgt_health               = SAS_HEALTH_ONLINE
iport_sasaddr            = 0x500605b002a697c4
tgt_type                 = SAS_TGT_TYPE_SCSI
tgt_proto_cap            = SAS_TGT_PROTO_SSP_CAPABLE
tgt_topology             = SAS_TGT_TOPO_EXPANDER
slot                     = 11
enc_id                   = 0x2
tgt_enc type             = SAS_TGT_ENC_TYPE_EXT_SES2
-- Target statistics --
        tgt_open_cnt             = 31692
        tgt_close_cnt            = 31691
        tgt_scsi_layer_ios       = 99698532
        tgt_scsi_layer_io_success= 99685901
        tgt_scsi_layer_io_fails  = 12758
-- Target Instance: p sasd_tgt_instance_t 0xe0000001cad68040 --
target state             = sasds_tgt_ready
current open count       = 1                                             <<<<<<<<<
it_nxs_abt_cap           = TGT_IT_NXS_ABT_UNKNOWN
tgt_info:
tgt_hdl                  = 0x14
iport_hdl                = 0x0
tgt_sasaddr              = 0x5000c5003c0af631                  <<<<<<<< this is c5t5d0
tgt_health               = SAS_HEALTH_ONLINE
iport_sasaddr            = 0x500605b002a697c4
tgt_type                 = SAS_TGT_TYPE_SCSI
tgt_proto_cap            = SAS_TGT_PROTO_SSP_CAPABLE
tgt_topology             = SAS_TGT_TOPO_EXPANDER
slot                     = 12
enc_id                   = 0x2
tgt_enc type             = SAS_TGT_ENC_TYPE_EXT_SES2
-- Target statistics --
        tgt_open_cnt             = 31621
        tgt_close_cnt            = 31620
        tgt_scsi_layer_ios       = 3173364
        tgt_scsi_layer_io_success= 3160744
        tgt_scsi_layer_io_fails  = 12747
From ioscan:
target      10  0/3/0/0/0/0.0.0.4            tgt          CLAIMED     DEVICE
disk         6  0/3/0/0/0/0.0.0.4.0          sdisk        CLAIMED     DEVICE       HP      EG0300FAWHV
                              /dev/dsk/c4t4d0   /dev/rdsk/c4t4d0
        Acpi(HPQ0002,PNP0A08,300)/Pci(0|0)/Pci(0|0)/Sas(Addr5000C5003BFEF6C9, Lun0)
target      11  0/3/0/0/0/0.0.0.5            tgt          CLAIMED     DEVICE
disk         7  0/3/0/0/0/0.0.0.5.0          sdisk        CLAIMED     DEVICE       HP      EG0300FAWHV
                              /dev/dsk/c4t5d0   /dev/rdsk/c4t5d0
        Acpi(HPQ0002,PNP0A08,300)/Pci(0|0)/Pci(0|0)/Sas(Addr5000C5003C0AB4B5, Lun0)
target      17  0/6/0/0/0/0/2/0/0/0.0.0.4    tgt          CLAIMED     DEVICE
disk        12  0/6/0/0/0/0/2/0/0/0.0.0.4.0  sdisk        CLAIMED     DEVICE       HP      EG0300FAWHV
                              /dev/dsk/c5t4d0   /dev/rdsk/c5t4d0
        Acpi(HPQ0002,PNP0A08,600)/Pci(0|0)/Pci(0|0)/Pci(2|0)/Pci(0|0)/Sas(Addr5000C5003C099DDD, Lun0)
target      18  0/6/0/0/0/0/2/0/0/0.0.0.5    tgt          CLAIMED     DEVICE
disk        13  0/6/0/0/0/0/2/0/0/0.0.0.5.0  sdisk        CLAIMED     DEVICE       HP      EG0300FAWHV
                              /dev/dsk/c5t5d0   /dev/rdsk/c5t5d0
        Acpi(HPQ0002,PNP0A08,600)/Pci(0|0)/Pci(0|0)/Pci(2|0)/Pci(0|0)/Sas(Addr5000C5003C0AF631, Lun0)
### strings /etc/lvmtab ###
/dev/vg00
/dev/dsk/c3t0d0s2
/dev/dsk/c3t0d1s2
/dev/vg01
/dev/dsk/c4t4d0
/dev/dsk/c5t4d0
/dev/vg04
/dev/dsk/c4t5d0
/dev/dsk/c5t5d0
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
vg01 is active, and the file systems on its lvols are mounted on jupiterA; hence it makes sense that its LUNs show a "current open count" of "1".
Interestingly, the LUNs that are part of vg04 also have an open count of 1, which means they too are in use.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
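To cross-check this from the LVM side, vgdisplay should confirm that vg04 is active and has open LVs (a quick sketch; the fields are standard vgdisplay output, actual values will vary):

# vgdisplay -v /dev/vg04 | grep -E "VG Status|Open LV"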
/4707390189-473/sysinfo_jupiterA# grep pvchange syslog.log
Aug  5 22:22:01 jupiterA LVM[29891]: pvchange -a y /dev/dsk/c4t5d0
Aug  5 22:22:16 jupiterA LVM[29920]: pvchange -a y /dev/dsk/c5t5d0
Aug  5 22:24:38 jupiterA LVM[1693]: pvchange -a n /dev/dsk/c4t5d0
Aug  5 22:25:48 jupiterA LVM[2042]: pvchange -a y /dev/dsk/c4t5d0
Aug  6 00:07:34 jupiterA LVM[7052]: pvchange -a y /dev/dsk/c4t5d0
Aug  6 00:07:47 jupiterA LVM[7070]: pvchange -a y /dev/dsk/c5t5d0
Aug  6 00:37:04 jupiterA LVM[16730]: pvchange -a N /dev/dsk/c4t5d0
Aug  6 00:37:14 jupiterA LVM[16785]: pvchange -a N /dev/dsk/c5t5d0
Aug  6 00:44:28 jupiterA LVM[12621]: pvchange -a y /dev/dsk/c4t5d0
Aug  6 00:44:36 jupiterA LVM[13366]: pvchange -a y /dev/dsk/c5t5d0
/4707390189-473/sysinfo_jupiterA# grep vgchange syslog.log
Aug  5 22:42:15 jupiterA LVM[6987]: vgchange -a r /dev/vg04
Aug  6 00:09:44 jupiterA LVM[7633]: vgchange -a r /dev/vg04
Aug  6 00:44:54 jupiterA LVM[14807]: vgchange -a r /dev/vg04
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
I've noticed the following I/O-related entries, all logged for VG 64 0x040000, which (per the group-file minor numbers below) is nothing but vg04:
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
crw-r-----   1 root       sys         64 0x000000 Sep 28  2011 /dev/vg00/group
crw-r--r--   1 root       sys         64 0x010000 Sep 30  2011 /dev/vg01/group
crw-r--r--   1 root       sys         64 0x040000 Sep 30  2011 /dev/vg04/group
Aug  6 00:37:04 jupiterA vmunix: LVM: VG 64 0x040000: Flushing the deferred attach list.
Aug  6 00:37:04 jupiterA vmunix: LVM: VG 64 0x040000: PVLink 31 0x045000 Detached.
Aug  6 00:37:04 jupiterA LVM[16730]: pvchange -a N /dev/dsk/c4t5d0
Aug  6 00:37:14 jupiterA vmunix: LVM: VG 64 0x040000: PVLink 31 0x055000 Detached.
Aug  6 00:37:14 jupiterA LVM[16785]: pvchange -a N /dev/dsk/c5t5d0
Aug  6 00:37:14 jupiterA vmunix: LVM: NOTICE: VG 64 0x040000: LV 1: All I/O requests to this LV that were
Aug  6 00:37:14 jupiterA vmunix: LVM: VG 64 0x040000: Flushing the deferred attach list.
Aug  6 00:37:14 jupiterA vmunix:        waiting indefinitely for an unavailable PV have now completed.
Aug  6 00:44:28 jupiterA LVM[12621]: pvchange -a y /dev/dsk/c4t5d0
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 4: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix:        indefinitely for an unavailable PV. These requests will be queued until
Aug  6 00:44:28 jupiterA vmunix:        the PV becomes available (or a timeout is specified for the LV).
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 7: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 9: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 8: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 10: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 14: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 16: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 5: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 13: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 1: Some I/O requests to this LV are waiting
Aug  6 00:44:36 jupiterA LVM[13366]: pvchange -a y /dev/dsk/c5t5d0
Aug  6 00:44:54 jupiterA LVM[14807]: vgchange -a r /dev/vg04
Aug  6 00:44:29 jupiterA vmunix:        indefinitely for an unavailable PV. These requests will be queued until
Aug  6 00:45:04 jupiterA  above message repeats 9 times
/4707390189-473/sysinfo_jupiterA# grep "LVM: NOTICE" syslog.log
Aug  6 00:37:14 jupiterA vmunix: LVM: NOTICE: VG 64 0x040000: LV 1: All I/O requests to this LV that were
hpuxftp@HPUXFTP_b8u8:/home/hpuxftp/crashdump/4707390189-473/sysinfo_jupiterA# grep "LVM: WARNING" syslog.log
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 4: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 7: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 9: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 8: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 10: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 14: Some I/O requests to this LV are waiting
Aug  6 00:44:28 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 16: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 5: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 13: Some I/O requests to this LV are waiting
Aug  6 00:44:29 jupiterA vmunix: LVM: WARNING: VG 64 0x040000: LV 1: Some I/O requests to this LV are waiting
Conclusion:
1. Although vg04 was activated read-only, we see that there are I/O requests to quite a number of its LVs.
2. You will not be able to deactivate this VG until those requests either complete or time out. If you know of any application that may be using data on this VG, shut the application down and try the deactivation again. Otherwise, the last option is to reboot the node (please take the necessary precautions, since it is a cluster node).
3. The LVM subsystem thinks that the PVs are unavailable. Bearing in mind that the I/O requests are waiting for an unavailable PV, the possibilities are that the PVs are too busy or that there is some delay at the connectivity level (between this node and the PVs).
4. Since there are two PVs on two different hardware paths, the point of failure must be common to both. Could you send me a diagram of this setup (connectivity)?
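As a follow-up, the sequence below is a minimal sketch of how such a hold could be hunted down and broken (the lvol name is illustrative; fuser, lvchange, and vgchange are standard HP-UX commands):

# vgchange -a n /dev/vg04          <- fails with "Device busy" while I/O is still queued
# fuser -cu /dev/vg04/lvol1        <- list the processes (and owners) using the lvol, if mounted
# lvchange -t 60 /dev/vg04/lvol1   <- set a 60-second I/O timeout so queued requests fail instead of waiting forever
# vgchange -a n /dev/vg04          <- retry once the queued I/O has drained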
WHAT HAPPENS WHEN A NODE TIMES OUT

Each node sends a heartbeat message to all other nodes at an interval equal to one-fourth of the
value of the configured MEMBER_TIMEOUT or 1 second, whichever is less. (For example, with a
MEMBER_TIMEOUT of 14 seconds, one-fourth would be 3.5 seconds, so heartbeats go out every 1 second.)

When a node detects that another node has failed (that is, no heartbeat message has arrived
within MEMBER_TIMEOUT microseconds), the following sequence of events occurs:

1. The node contacts the other nodes and tries to re-form the cluster without the failed node.
2. If the remaining nodes are a majority or can obtain the cluster lock, they form a new cluster
without the failed node.
3. If the remaining nodes are not a majority or cannot get the cluster lock, they halt (system reset).

HEALTHY NODE STATUS:
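As an illustration, cmviewcl output for a healthy two-node cluster looks roughly like this (cluster and node names are examples):

# cmviewcl

CLUSTER        STATUS
cluster1       up

  NODE           STATUS       STATE
  JUPITOR        up           running
  EARTH          up           running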


IN CASE OF FAILURE:
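Illustratively, after a node failure cmviewcl would show something like this (again, names are examples):

# cmviewcl

CLUSTER        STATUS
cluster1       up

  NODE           STATUS       STATE
  JUPITOR        up           running
  EARTH          down         failed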

EXAMPLE
SITUATION.
Assume a two-node cluster, with Package1 running on JUPITOR and Package2 running
on EARTH. Volume group vg01 is exclusively activated on JUPITOR; volume group vg02 is
exclusively activated on EARTH. Package IP addresses are assigned to JUPITOR and EARTH
respectively.

FAILURE.
Only one LAN has been configured for both heartbeat and data traffic. During the course
of operations, heavy application traffic monopolizes the bandwidth of the network, preventing
heartbeat packets from getting through.

Since JUPITOR does not receive heartbeat messages from EARTH, JUPITOR attempts to reform
as a one-node cluster. Likewise, since EARTH does not receive heartbeat messages from
JUPITOR, EARTH also attempts to reform as a one-node cluster.

ELECTION PROCESS:
During the election protocol, each node votes for itself, giving both nodes 50 percent of the vote.
Because both nodes have 50 percent of the vote, both nodes now vie for the cluster lock.
Only one node will get the lock.

OUTCOME.
Assume JUPITOR gets the cluster lock. JUPITOR reforms as a one-node cluster. After
re-formation, JUPITOR will make sure all applications configured to run on an existing cluster
node are running. When JUPITOR discovers Package2 is not running in the cluster it will try to
start Package2 if Package2 is configured to run on JUPITOR.
EARTH recognizes that it has failed to get the cluster lock and so cannot re-form the cluster. To
release all resources related to Package2 (such as exclusive access to volume group vg02 and
the Package2 IP address) as quickly as possible, EARTH halts (system reset).

HPUX : Virtual partition

       Command       |                  Description
      _______________|_________________________________________________________
      vecheck        | Check for virtual partition environment.
                     |
      vparadmin      | Modify virtual partition flexible administrative
                     | capability related attributes.
                     |
      vparboot       | Boot (start) a virtual partition.
                     |
      vparcreate     | Create a new virtual partition.
                     |
      vpardump       | Manage monitor dump files.
                     |
      vparefiutil    | Update EFI device paths of bootable disks in the
                     | vPar database.  Itanium(R)-based platforms only.
                     |
      vparenv        | Set vPars or nPars mode, ILM or CLM granularity in
                     | system firmware, or display current values of these
                     | settings.  Itanium-based platforms only.
                     |
      vparextract    | Extract memory images from a running virtual partition
                     | system.
                     |
      vparmodify     | Modify an existing virtual partition.
                     |
      vparreloc      | Relocate the load address of a vmunix file, determine
                     | if a vmunix file is relocatable, or promote the scope of
                     | symbols in a relocatable vmunix file.
                     | PA-RISC platforms only.
                     |
      vparremove     | Remove (delete) an existing virtual partition.
                     |
      vparreset      | Send a hard reset (TOC) to a virtual partition.
                     |
      vparresources  | Description of virtual partition resources and
                     | their requirements.
                     |
      vparstatus     | Display virtual partition and available resources
                     | information.
                     |
      vparutil       | Get and set SCSI parameters for disk devices from
                     | a virtual partition.  PA-RISC platforms only.
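
As a quick usage sketch (partition name and resource sizes are illustrative; see the vPars manpages for exact options):

# vparstatus -v                               <- verbose status of all vPars and their resources
# vparcreate -p vpar1 -a cpu::2 -a mem::4096  <- create a vPar with 2 CPUs and 4096 MB of memory
# vparboot -p vpar1                           <- boot the new vPar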

HPUX : Creating a new INDEX



Here are the errors I was provided:

ERROR:   Could not validate the viability of updating from
         B.11.31:HPUX11i-VSE-OE HPUX11i-VSE-OE to HPUX11i-VSE-OE.
ERROR:   The script "/var/adm/sw/pre_update/update_ok.001" returned a value of "1" (ERROR)
       * Running "/var/adm/sw/pre_update/update_ok.001". failed with 2 errors.

The above errors occur when there are multiple "HPUX11i-VSE-OE" bundles listed in swlist. To work around this condition, you must remove the older HPUX11i-VSE-OE swlist entry as follows:

- First I would recommend making a tar or other backup of /var/adm/sw/products, so you can get back to the current state if needed.

- Locate HPUX11i-VSE-OE directories:
#cd /var/adm/sw/products

#ls -d HPUX11i* 
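
Illustrative output when a stale duplicate entry exists (the ".2" suffix is just an example):

HPUX11i-VSE-OE    HPUX11i-VSE-OE.2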


Move the older HPUX11i-VSE-OE directory and the /var/adm/sw/products/INDEX file to /tmp.
Example:
# mv HPUX11i-VSE-OE.2 /tmp
# mv INDEX /tmp

- Recreate the Installed Products Database (/var/adm/sw/products/INDEX ).

Create the void.psf file with the following contents:
#cat  void.psf
product
    tag void
fileset
    tag void

Create the dummy void product used to rebuild the INDEX file:
#swpackage -s ./void.psf
#swinstall void


A new INDEX file should now exist.

- Verify that swlist no longer shows duplicate HPUX11i-VSE-OE  listings.

Cleanup:
#swremove void 
#rm void.psf  
#swremove -d void

Provided swlist now shows a single OE bundle, the pre_update script update_ok.001 called by update-ux should no longer report an error.
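
A quick check to run before and after the procedure (sketch; standard swlist usage):

# swlist -l bundle | grep HPUX11i-VSE-OE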


VERITAS : File system needs a full fsck when an I/O error occurs while reading the inode list



Overview
A file system needs a full fsck when an I/O error occurs while reading the inode list.
Procedures
ERROR MESSAGE:

Jun 11 21:49:29 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 3 mesg 079: V-2-79: vx_tranuninode - /db04 file system inode 120 marked bad ondisk
Jun 11 21:49:31 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 4 mesg 016: V-2-16: vx_ilisterr: vx_iupdat_local_0 - /db04 file system error reading inode 120
Jun 11 22:24:15 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 6 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6
Jun 11 22:24:19 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 7 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6
Jun 11 22:26:51 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 8 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6
Jun 11 22:26:52 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 9 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6
Jun 11 22:27:31 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 10 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6
Jun 11 22:27:47 ebzdbp14 vmunix: vxfs: WARNING: msgcnt 11 mesg 008: V-2-8: vx_direrr: vx_readdir2_3 - /db04 file system dir inode 16385 dev/block 0/22490472 dirent inode 0 error 6


DETAILED DESCRIPTION

An I/O error occurred while reading the inode list. The VX_FULLFSCK flag is set.

When inode information is no longer dependable, the kernel marks it bad on disk. The most common reason for marking an inode bad is a disk I/O failure. If there is an I/O failure in the inode list, on a directory block, or an indirect address extent, the integrity of the data in the inode, or the data the kernel tried to write to the inode list, is questionable. In these cases, the disk driver prints an error message and one or more inodes are marked bad.

The kernel also marks an inode bad if it finds a bad extent address, invalid inode fields, or corruption in directory data blocks during a validation check. A validation check failure indicates the file system has been corrupted. This usually occurs because a user or process has written directly to the device or used fsdb to change the file system.

The VX_FULLFSCK flag is set in the super-block so fsck will do a full structural check the next time it is run.


We verified the disk information related to the /db04 file system.

The disks below were used to create the /db04 file system; note that one disk's status shows as "failing".


From the racdg disk group information:

c5t1d0       auto:cdsdisk    racdisk6     racdg        online shared failing
c5t0d7       auto:cdsdisk    racdisk5     racdg        online shared
c5t1d4       auto:cdsdisk    racdisk0     racdg        online shared
c3t2d6       auto:cdsdisk    racdisk13    racdg        online shared
c3t3d0       auto:cdsdisk    racdisk15    racdg        online shared
c5t0d3       auto:cdsdisk    racdisk1     racdg        online shared
c5t0d4       auto:cdsdisk    racdisk2     racdg        online shared
c7t2d3       auto:cdsdisk    racdisk18    racdg        online shared
c7t2d4       auto:cdsdisk    racdisk19    racdg        online shared
c5t0d6       auto:cdsdisk    racdisk4     racdg        online shared


From the crsdg disk group information:

c5t1d6       auto:cdsdisk    crsdsk1      crsdg        online shared failing
c5t1d7       auto:cdsdisk    crsdsk2      crsdg        online shared failing
c5t2d0       auto:cdsdisk    crsdsk3      crsdg        online shared failing


Veritas Storage Foundation lists the status of a disk as "failing" in response to errors that are detected while reading or writing to a disk. The status is designed to draw administrative attention to disks that have experienced errors. Reviewing the status of the disks in the disk array, as well as any connected storage area network (SAN) components, is recommended to determine if a hardware problem exists.

Since it is possible for a disk to be flagged as "failing" in response to an isolated event, this status does not necessarily mean that the disks have a hardware problem.


SOLUTION:

Check the console log for I/O errors. If the problem is a disk failure, replace the disk. If the problem is not related to an I/O failure, find out how the disk became corrupted. If no user or process is writing to the device, report the problem to your customer support organization. In either case, unmount the file system and use fsck to run a full structural check.
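
A minimal sketch of that full structural check (the disk group and volume names are illustrative; -o full and -y are standard VxFS fsck options):

# umount /db04
# fsck -F vxfs -o full -y /dev/vx/rdsk/racdg/db04vol
# mount /db04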

1. Check the disk array as well as any connected SAN components for hardware problems.
2. Review the messages log, for the operating system, for events that refer to disk read and write errors.
3. If no persistent I/O errors are discovered, it may be that the "failing" status was triggered by a transient error rather than a truly failing disk. In this case, you can simply clear the status. If the failing status continues to reappear for the same disk, it may be a sign of a genuine hardware problem with the disk or with the SAN connectivity.

Kindly verify the external storage device status with the SAN team.

To clear the failing flag, kindly run the commands below.


#vxedit -g racdg set failing=off racdisk6

#vxedit -g crsdg set failing=off crsdsk1      

#vxedit -g crsdg set failing=off crsdsk2

#vxedit -g crsdg set failing=off crsdsk3      
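
To confirm the flag has cleared afterwards (sketch):

# vxdisk list | grep failing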

Keywords: veritas