Skip to content

Commit bd20fc4

Browse files
committed
test(robot): add io error test cases
- Replica Fails When Filesystem Disk Has I/O Errors longhorn/longhorn#12288 Signed-off-by: Chris Chien <chris.chien@suse.com>
1 parent fc2311d commit bd20fc4

6 files changed

Lines changed: 289 additions & 0 deletions

File tree

e2e/keywords/io.resource

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
*** Settings ***
2+
Documentation I/O error injection keywords for v1 (device mapper) and v2 (SPDK) data engines
3+
4+
Library ../libs/keywords/io_keywords.py
5+
Library ../libs/keywords/instancemanager_keywords.py
6+
Library ../libs/keywords/common_keywords.py
7+
Library ../libs/keywords/node_keywords.py
8+
9+
*** Keywords ***
10+
Add dm device as Longhorn disk on node ${node_id}
    [Documentation]    Build a dm-linear device on top of a Longhorn volume, format and mount it,
    ...    then register the mount point as a filesystem-type Longhorn disk on the node.
    ...    The sector count reported by setup_dm_linear_device is stored in the dm_sectors
    ...    test variable, and the full info dictionary is returned to the caller.
    [Arguments]    ${volume_name}    ${dm_device_name}    ${mount_point}    ${disk_name}
    ${target_node} =    get_node_by_index    ${node_id}
    ${dm_info} =    setup_dm_linear_device    ${volume_name}    ${dm_device_name}    ${target_node}
    # Kept at test scope so "Switch dm device to error mode" can reuse the size later.
    Set Test Variable    ${dm_sectors}    ${dm_info}[sectors]
    format_and_mount_dm_device    ${dm_device_name}    ${mount_point}    ${target_node}
    add_disk    ${disk_name}    ${target_node}    filesystem    ${mount_point}
    RETURN    ${dm_info}
19+
20+
Create and mount dm disk from volume ${volume_name} as ${dm_device_name} on node ${node_id} to ${mount_point}
    [Documentation]    Build a dm-linear device on top of a Longhorn volume, then format it as ext4
    ...    and mount it at the given mount point. Does not register a Longhorn disk.
    ...    Stores the device sector count in the dm_sectors test variable and returns
    ...    the info dictionary from setup_dm_linear_device.
    ${target_node} =    get_node_by_index    ${node_id}
    ${dm_info} =    setup_dm_linear_device    ${volume_name}    ${dm_device_name}    ${target_node}
    # Kept at test scope so "Switch dm device to error mode" can reuse the size later.
    Set Test Variable    ${dm_sectors}    ${dm_info}[sectors]
    format_and_mount_dm_device    ${dm_device_name}    ${mount_point}    ${target_node}
    RETURN    ${dm_info}
27+
28+
Switch dm device to error mode
    [Documentation]    Replace the dm device's table with the error target so that I/O on it fails.
    ...    Relies on the dm_sectors test variable, which the dm setup keywords in this
    ...    resource file set when the device is created.
    [Arguments]    ${dm_device_name}    ${node_id}
    ${target_node} =    get_node_by_index    ${node_id}
    switch_dm_device_to_error    ${dm_device_name}    ${dm_sectors}    ${target_node}
33+
34+
Force unmount dm disk at ${mount_point} on node ${node_id}
    [Documentation]    Unmount the dm-backed filesystem on the node: processes using the mount
    ...    are killed first, then a regular unmount is tried with a lazy unmount as fallback.
    ${target_node} =    get_node_by_index    ${node_id}
    force_unmount_dm_device    ${mount_point}    ${target_node}
38+
39+
Cleanup mounted dm disk ${dm_device_name} at ${mount_point} on node ${node_id}
    [Documentation]    Lazily unmount the mount point (when one is given) and remove the
    ...    dm device from the node.
    ${target_node} =    get_node_by_index    ${node_id}
    cleanup_dm_device    ${dm_device_name}    ${mount_point}    ${target_node}

e2e/keywords/node.resource

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,7 @@ Record node ${node_id} default disk uuid
180180
Wait for Longhorn node ${node_id} up
    [Documentation]    Resolve the node index to a node name and wait for that Longhorn node to be up.
    ${target_node} =    get_node_by_index    ${node_id}
    wait_for_longhorn_node_up    ${target_node}
183+
184+
Add ${disk_type} disk ${disk_name} to node ${node_id} with path ${mount_path}
    [Documentation]    Register a disk of the given type (for example "filesystem") at the given
    ...    path on the node identified by index.
    ${target_node} =    get_node_by_index    ${node_id}
    add_disk    ${disk_name}    ${target_node}    ${disk_type}    ${mount_path}

e2e/keywords/volume.resource

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,11 @@ Volume ${volume_id} should have running replicas on node ${node_id}
529529
Set Test Variable ${node_name}
530530
Set Test Variable ${replica_count}
531531

532+
Volume ${volume_id} should have no running replica on node ${node_id}
    [Documentation]    Wait until the volume reports zero replicas on the given node.
    ${target_volume} =    generate_name_with_suffix    volume    ${volume_id}
    ${target_node} =    get_node_by_index    ${node_id}
    ${observed_count} =    wait_for_replica_count    ${target_volume}    node_name=${target_node}    replica_count=0
536+
532537
Volume ${volume_id} should have ${expected_replica_count} running replicas on node ${node_id} and no additional scheduling occurs
533538
Volume ${volume_id} should have ${expected_replica_count} running replicas on node ${node_id}
534539
FOR ${i} IN RANGE ${SCHEDULING_STATUS_CHECK_TIMEOUT}

e2e/libs/keywords/instancemanager_keywords.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from instancemanager import InstanceManager
22

33
from utility.utility import logging
4+
from utility.utility import get_longhorn_client
5+
from utility.utility import pod_exec
6+
import utility.constant as constant
47

58

69
class instancemanager_keywords:

e2e/libs/keywords/io_keywords.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from utility.utility import logging
2+
from node_exec import NodeExec
3+
4+
5+
class io_keywords:
    """Robot Framework keywords for building and breaking device mapper devices.

    Every keyword runs shell commands on a node through
    ``NodeExec(node_name).issue_cmd`` and works on devices under
    ``/dev/mapper``.

    NOTE(review): parameters are deliberately left without type hints.
    Robot Framework can use hints for argument conversion, so adding them
    might change what the ``.resource`` callers pass in -- confirm before
    annotating.
    """

    def __init__(self):
        pass

    def setup_dm_linear_device(self, volume_name, dm_device_name, node_name):
        """Create a dm-linear device that maps 1:1 onto a Longhorn volume device.

        Steps:
            1. Check that /dev/longhorn/{volume_name} exists.
            2. Read the device size in 512-byte sectors (``blockdev --getsz``).
            3. Create the dm-linear mapping with ``dmsetup create``.
            4. Verify the mapping with ``dmsetup table``.

        Args:
            volume_name: Longhorn volume whose block device backs the mapping.
            dm_device_name: Name of the dm device to create.
            node_name: Node on which the commands are executed.

        Returns:
            dict with keys ``real_dev`` (backing device path), ``sectors``
            (size as a digit string) and ``dm_device`` (/dev/mapper path).

        Raises:
            Exception: if the backing device is missing, its size cannot be
                read, or the dm device cannot be created or verified.
        """
        logging(f"Setting up dm-linear device {dm_device_name} from volume {volume_name} on node {node_name}")

        # Get device path.
        # `test -e` prints nothing on its own, so emit an explicit marker when
        # the device is absent; otherwise the check below could never fire.
        device_path = f"/dev/longhorn/{volume_name}"
        cmd = f"test -e {device_path} || echo DEVICE_NOT_FOUND"
        result = NodeExec(node_name).issue_cmd(cmd)
        if result and ("DEVICE_NOT_FOUND" in result or "No such file" in result):
            raise Exception(f"Device {device_path} does not exist")

        # Get device size in sectors
        cmd = f"blockdev --getsz {device_path}"
        raw_size = NodeExec(node_name).issue_cmd(cmd)
        # Guard against a None/empty result before calling str methods on it.
        sectors = raw_size.strip() if raw_size else ""
        if not sectors.isdigit():
            raise Exception(f"Failed to get device size in sectors. Result: {sectors}")

        # Create dm-linear device
        cmd = f'echo "0 {sectors} linear {device_path} 0" | dmsetup create --noudevsync {dm_device_name}'
        create_result = NodeExec(node_name).issue_cmd(cmd)
        if create_result and ("failed" in create_result.lower() or "error" in create_result.lower()):
            raise Exception(f"Failed to create dm device. Result: {create_result}")

        # Verify device creation: an existing device reports its table, so an
        # empty answer is treated as a failure as well.
        cmd = f"dmsetup table --noudevsync {dm_device_name}"
        table = NodeExec(node_name).issue_cmd(cmd)
        if not table or "failed" in table.lower() or "No such device" in table:
            raise Exception(f"Device verification failed. Table result: {table}")

        result = {
            "real_dev": device_path,
            "sectors": sectors,
            "dm_device": f"/dev/mapper/{dm_device_name}",
        }
        logging(f"Successfully created dm device {dm_device_name}: {result}")
        return result

    def format_and_mount_dm_device(self, dm_device_name, mount_point, node_name):
        """Format /dev/mapper/{dm_device_name} as ext4 and mount it at mount_point.

        The filesystem is mounted with ``errors=continue``. Command output is
        not inspected, so failures of mkfs/mkdir/mount are not reported here.

        Args:
            dm_device_name: Name of the dm device under /dev/mapper.
            mount_point: Directory to create (if needed) and mount onto.
            node_name: Node on which the commands are executed.
        """
        logging(f"Formatting and mounting dm device {dm_device_name} to {mount_point} on node {node_name}")

        dm_path = f"/dev/mapper/{dm_device_name}"

        # Format device as ext4
        cmd = f"mkfs.ext4 -F {dm_path}"
        NodeExec(node_name).issue_cmd(cmd)

        # Create mount point
        cmd = f"mkdir -p {mount_point}"
        NodeExec(node_name).issue_cmd(cmd)

        # Mount device with errors=continue option
        cmd = f"mount -t ext4 -o errors=continue {dm_path} {mount_point}"
        NodeExec(node_name).issue_cmd(cmd)

        logging(f"Successfully formatted and mounted {dm_device_name} to {mount_point}")

    def force_unmount_dm_device(self, mount_point, node_name):
        """Kill processes using the mount, then unmount it (lazily if needed).

        This is best-effort: every command tolerates failure (``|| true``)
        and the final result of the unmount is not verified.

        Args:
            mount_point: Mounted directory to release.
            node_name: Node on which the commands are executed.
        """
        import time

        logging(f"Force unmounting {mount_point} on node {node_name}")

        # Kill all processes using the mount point
        cmd = f"fuser -km {mount_point} 2>/dev/null || true"
        NodeExec(node_name).issue_cmd(cmd)

        # Wait for processes to terminate. Poll with -m as well so the wait
        # covers the same set of processes (anything on the mounted
        # filesystem) that the kill above targeted.
        timeout = 10
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            cmd = f"fuser -m {mount_point} 2>/dev/null"
            holders = NodeExec(node_name).issue_cmd(cmd)
            if not holders or not holders.strip():
                logging(f"All processes using {mount_point} have terminated")
                break
            time.sleep(0.5)
        else:
            logging(f"Warning: Timeout waiting for processes to terminate on {mount_point}")

        # Try normal unmount first, fallback to lazy unmount
        cmd = f"umount {mount_point} 2>/dev/null || umount -l {mount_point} 2>/dev/null || true"
        NodeExec(node_name).issue_cmd(cmd)

        logging(f"Successfully force unmounted {mount_point}")

    def switch_dm_device_to_error(self, dm_device_name, sectors, node_name):
        """Swap the dm device's table for the ``error`` target so all I/O fails.

        The new table is loaded into the inactive slot, then the device is
        suspended (without locking the filesystem) and resumed, which
        activates the loaded table.

        Args:
            dm_device_name: Name of the dm device to switch.
            sectors: Device size in sectors; must span the whole device.
            node_name: Node on which the commands are executed.
        """
        logging(f"Switching dm device {dm_device_name} to error mode on node {node_name}")

        # Load the error table (inactive)
        cmd = f'echo "0 {sectors} error" | dmsetup load --noudevsync {dm_device_name}'
        NodeExec(node_name).issue_cmd(cmd)

        # Suspend the device without locking filesystem
        cmd = f"dmsetup suspend --noudevsync --nolockfs {dm_device_name}"
        NodeExec(node_name).issue_cmd(cmd)

        # Resume device to activate the loaded table
        cmd = f"dmsetup resume --noudevsync {dm_device_name}"
        NodeExec(node_name).issue_cmd(cmd)

        logging(f"Successfully switched {dm_device_name} to error mode")

    def cleanup_dm_device(self, dm_device_name, mount_point, node_name):
        """Lazily unmount mount_point (if given) and remove the dm device.

        Best-effort: each command ignores its own failure so cleanup can run
        even when the device or mount is already gone.

        Args:
            dm_device_name: Name of the dm device to remove.
            mount_point: Directory to unmount; skipped when empty or blank.
            node_name: Node on which the commands are executed.
        """
        logging(f"Cleaning up dm device {dm_device_name} on node {node_name}")

        # Lazy unmount if mount point provided
        if mount_point and mount_point.strip():
            logging(f"Unmounting {mount_point}")
            cmd = f"umount -l {mount_point} 2>/dev/null || true"
            NodeExec(node_name).issue_cmd(cmd)

        # Wipe device mapper table
        cmd = f"dmsetup wipe_table --noudevsync {dm_device_name} --nolockfs 2>/dev/null || true"
        NodeExec(node_name).issue_cmd(cmd)

        # Remove device mapper device (deferred)
        cmd = f"dmsetup remove --noudevsync --deferred {dm_device_name} 2>/dev/null || true"
        NodeExec(node_name).issue_cmd(cmd)

        logging(f"Successfully cleaned up dm device {dm_device_name}")

e2e/tests/negative/io_error.robot

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
*** Settings ***
2+
Documentation Disk I/O Error Test Cases
3+
4+
Test Tags negative disk io-error
5+
6+
Resource ../keywords/variables.resource
7+
Resource ../keywords/common.resource
8+
Resource ../keywords/volume.resource
9+
Resource ../keywords/node.resource
10+
Resource ../keywords/io.resource
11+
Resource ../keywords/longhorn.resource
12+
Resource ../keywords/host.resource
13+
14+
Test Setup Set up test environment
15+
Test Teardown Cleanup test resources
16+
17+
*** Keywords ***
18+
Setup device mapper environment
    [Documentation]    Test setup for dm-based I/O error tests: run the common environment
    ...    setup, then prepare the dm-backed disk on node 0.
    Set up test environment
    Setup dm disk environment on node 0
21+
22+
Cleanup device mapper environment
    [Documentation]    Test teardown for dm-based I/O error tests: tear down the dm-backed disk
    ...    on node 0 first, then run the common resource cleanup.
    Cleanup dm disk environment on node 0 for v1
    Cleanup test resources
25+
26+
Setup dm disk environment on node ${node_id}
    [Documentation]    Prepare a dm-backed filesystem disk used to inject I/O errors.
    ...    Creates and attaches a 4Gi single-replica v1 base volume, maps a dm-linear device
    ...    onto it, formats and mounts that device at /mnt/disk, disables the node's default
    ...    disk and adds the mount as a filesystem disk.
    ...    Publishes disk_name, dm_device_name and mount_path as test variables.
    ${dm_device_name} =    Generate random disk name    prefix=dm-dev    length=8
    ${disk_name} =    Generate random disk name    prefix=dm-disk
    ${mount_path} =    Set Variable    /mnt/disk
    ${backing_volume} =    generate_name_with_suffix    volume    ${disk_name}
    Set Test Variable    ${disk_name}
    Set Test Variable    ${dm_device_name}
    Set Test Variable    ${mount_path}

    # The base volume reuses the disk name as its volume id.
    Log    Using dm device: ${dm_device_name} for volume: ${disk_name}
    When Create volume ${disk_name} with    size=4Gi    numberOfReplicas=1    frontend=blockdev    dataEngine=v1
    Then Attach volume ${disk_name} to node ${node_id}
    And Wait for volume ${disk_name} healthy

    ${mapper_info} =    Create and mount dm disk from volume ${backing_volume} as ${dm_device_name} on node ${node_id} to ${mount_path}
    Log    Device mapper info: ${mapper_info}

    When Disable node ${node_id} default disk
    And Add filesystem disk ${disk_name} to node ${node_id} with path ${mount_path}
49+
50+
Cleanup dm disk environment on node ${node_id} for ${data_engine}
    [Documentation]    Tear down the dm-backed disk created by the dm disk setup keyword.
    ...    Disables scheduling on the disk, deletes test volume 2, force-unmounts the dm
    ...    filesystem, deletes the base volume, removes the dm device and finally deletes
    ...    the Longhorn disk. Relies on the disk_name, dm_device_name and mount_path test
    ...    variables.
    ...    NOTE(review): the data_engine argument is accepted but not used by any step here.
    When Disable disk ${disk_name} scheduling on node ${node_id}
    # Deletion is started without waiting; the wait happens after the unmount below.
    And Delete volume 2    wait=False
    Then Force unmount dm disk at ${mount_path} on node ${node_id}
    And Wait for volume 2 deleted
    And Delete volume ${disk_name}
    And Cleanup mounted dm disk ${dm_device_name} at ${mount_path} on node ${node_id}
    And Delete disk ${disk_name} on node ${node_id}
60+
61+
*** Test Cases ***
62+
Replica Fails When Filesystem Disk Has I/O Errors
    [Tags]    disk    io-error    replica    v1
    [Documentation]    Verify that a V1 replica fails when its filesystem disk starts returning I/O errors.
    ...
    ...    Errors are injected at the kernel block layer by switching a device mapper
    ...    device to the error target. This suits the V1 data engine, which stores
    ...    replica data on a filesystem.
    ...
    ...    Test Steps:
    ...    1. (Setup) Create a 4GB base volume with 1 replica on node 0 and attach it.
    ...    2. (Setup) Create a dm-linear device on top of the Longhorn volume device.
    ...    3. (Setup) Format the dm device as ext4 and mount it to /mnt/disk.
    ...    4. (Setup) Disable the default disk on node 0 and add the dm mount as a filesystem disk.
    ...    5. Create a 1GB test volume with 3 replicas and attach it to node 0.
    ...    6. Switch the dm device from linear to error mode (dmsetup) to inject I/O errors.
    ...    7. Write data to the test volume to trigger I/O errors on the dm disk replica.
    ...    8. Verify the test volume becomes degraded because of the failed replica.
    ...    9. Verify node 0 has no running replica and the test volume data is intact.
    [Setup]    Setup device mapper environment
    [Teardown]    Cleanup device mapper environment
    # The setup above has already run by the time this check is evaluated.
    Skip If    '${DATA_ENGINE}' == 'v2'    Test only validate on v1 data engine

    Given Create volume 2 with    size=1Gi    numberOfReplicas=3    dataEngine=v1
    And Attach volume 2 to node 0
    And Wait for volume 2 healthy

    When Switch dm device to error mode    dm_device_name=${dm_device_name}    node_id=0
    And Write data 0 500 MB to volume 2
    Then Wait for volume 2 degraded
    And Volume 2 should have no running replica on node 0
    And Check volume 2 data is intact
95+
96+
Replica Fails When Block Disk Has I/O Errors
    [Documentation]    Placeholder for the block disk variant of the I/O error test.
    ...    Skipped until https://github.com/longhorn/longhorn/issues/13354 is resolved.
    Skip    Waiting for https://github.com/longhorn/longhorn/issues/13354 to be resolved.

0 commit comments

Comments
 (0)