summaryrefslogtreecommitdiff
path: root/tests/test_gpu_container.sh
blob: 593f927bf2a3b08b598bd5d35616539e8a6bc250 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash

# Container Test Script for rocky_dev_gpu:latest
# This script tests all the functionality of the GPU-enabled container

# Abort the run on the first command failure that is not explicitly guarded.
set -o errexit

# Image under test, a per-run container name (suffixed with this shell's PID),
# and a random host port in the range 40000-50000.
IMAGE_NAME='rocky_dev_gpu:latest'
CONTAINER_NAME="rocky_dev_gpu_test_$$"
TEST_PORT="$(shuf -i 40000-50000 -n 1)"

# Stop and remove the test container.
# Globals: CONTAINER_NAME (read)
# Outputs: progress messages on stdout; podman output is discarded
# Best-effort by design: podman failures (for example, the container was
# never created) are ignored so that cleanup itself never fails.
cleanup() {
    echo ""
    echo "Cleaning up..."
    # Quoted so the name always reaches podman as exactly one argument.
    podman stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
    podman rm "$CONTAINER_NAME" >/dev/null 2>&1 || true
    echo "Container $CONTAINER_NAME removed"
}

# Register cleanup for every exit path of the script.
trap cleanup EXIT

# Banner: suite title plus the container name and host port used by this run,
# followed by one blank line.
cat <<BANNER
=== Rocky Dev GPU Container Test Suite ===
Container: $CONTAINER_NAME
Port: $TEST_PORT

BANNER

# Run a shell snippet inside the test container.
# Globals:   CONTAINER_NAME (read)
# Arguments: $1 - command string, executed by `bash -c` inside the container
# Returns:   the exit status of `podman exec`
run_in_container() {
    # Quoted so the container name is always passed as a single argument.
    podman exec "$CONTAINER_NAME" bash -c "$1"
}

# Report whether a command can be resolved inside the container.
# Arguments: $1 - command name to look up with `command -v`
# Outputs:   "Checking <name>... " followed by ✓ or ✗ on stdout
# Returns:   0 when the command is found, 1 otherwise
check_command() {
    local tool=$1
    local status=0

    echo -n "Checking $tool... "
    run_in_container "command -v $tool" >/dev/null 2>&1 || status=1

    if [ "$status" -eq 0 ]; then
        echo "✓"
    else
        echo "✗"
    fi
    return "$status"
}

# Start the container detached, publishing container port 22 on the host port
# TEST_PORT and passing `--device nvidia.com/gpu=all`.
echo "1. Starting GPU container..."
# Every expansion is quoted so each value reaches podman as one argument.
podman run -d -p "${TEST_PORT}:22" --device nvidia.com/gpu=all --name "$CONTAINER_NAME" "$IMAGE_NAME"
# Fixed pause before the first `podman exec`; presumably this gives the
# container time to come up. There is no readiness check.
sleep 5

echo ""
echo "2. Testing base container functionality..."
echo "(Inherited from rocky_dev:latest)"

# Base toolchain: the first missing tool is reported and ends the run with
# exit status 1.
echo -n "Development tools: "
for tool in gcc g++ make cmake git python3; do
    if ! run_in_container "command -v $tool" >/dev/null 2>&1; then
        echo "✗ Missing $tool"
        exit 1
    fi
done
echo "✓"

# Rust and Node.js are report-only: a failure prints ✗ and the run continues.
echo -n "Rust toolchain: "
if run_in_container "source /root/.cargo/env && cargo --version" >/dev/null 2>&1; then
    echo "✓"
else
    echo "✗"
fi

echo -n "Node.js: "
if run_in_container "source /root/.nvm/nvm.sh && node --version" >/dev/null 2>&1; then
    echo "✓"
else
    echo "✗"
fi

echo ""
echo "3. Testing GPU-specific packages..."
# GPU utilities. The lspci check is unguarded, so under `set -e` a missing
# lspci aborts the script; a missing nvidia-smi only prints a note.
check_command lspci
if ! check_command nvidia-smi; then
    echo "  (nvidia-smi requires actual GPU hardware)"
fi

# RPM packages, given as "label:package". These checks are report-only.
for entry in "kernel headers:kernel-headers" "kernel-devel:kernel-devel" "pciutils:pciutils"; do
    label=${entry%%:*}
    pkg=${entry#*:}
    echo -n "Checking $label... "
    if run_in_container "rpm -q $pkg" >/dev/null 2>&1; then
        echo "✓"
    else
        echo "✗"
    fi
done

echo ""
echo "4. Testing NVIDIA container toolkit..."
# Report-only: asks rpm inside the container whether the package is installed.
echo -n "Checking nvidia-container-toolkit... "
if run_in_container "rpm -q nvidia-container-toolkit" >/dev/null 2>&1; then
    echo "✓"
else
    echo "✗"
fi

echo ""
echo "5. Testing GPU environment variables..."
# Each variable is read inside the container and must equal the expected value
# exactly. grep -F treats the pattern as a literal string and -x requires it to
# match the whole line, so a different value that merely contains the expected
# text (e.g. "compute,utility,graphics") is not reported as a match.
echo -n "NVIDIA_VISIBLE_DEVICES... "
if run_in_container 'echo "$NVIDIA_VISIBLE_DEVICES"' | grep -Fxq -- "all"; then
    echo "✓ Set to 'all'"
else
    echo "✗ Not set correctly"
fi

echo -n "NVIDIA_DRIVER_CAPABILITIES... "
if run_in_container 'echo "$NVIDIA_DRIVER_CAPABILITIES"' | grep -Fxq -- "compute,utility"; then
    echo "✓ Set to 'compute,utility'"
else
    echo "✗ Not set correctly"
fi

echo ""
echo "6. Testing GPU test script..."
# The helper script must be present in the container with the execute bit set.
echo -n "Checking /usr/local/bin/gpu-test.sh... "
if run_in_container "test -x /usr/local/bin/gpu-test.sh"; then
    echo "✓ Exists and executable"
else
    echo "✗ Not found or not executable"
fi

# Run the helper and show its output between rulers. A non-zero exit is
# tolerated and only produces a note.
echo ""
echo "Running GPU test script:"
echo "------------------------"
if ! run_in_container "/usr/local/bin/gpu-test.sh"; then
    echo "Note: Some GPU tests may fail without actual GPU hardware"
fi
echo "------------------------"

echo ""
echo "7. Testing workspace directory..."
# Report-only check that /workspace is a directory inside the container.
echo -n "Checking /workspace directory... "
if run_in_container "test -d /workspace"; then
    echo "✓ Exists"
else
    echo "✗ Not found"
fi

echo ""
echo "8. Testing PCI device detection..."
# List PCI devices whose lspci line mentions nvidia, vga, 3d or display
# (case-insensitive). The fallback message is printed inside the container
# when nothing matches.
echo "PCI devices (filtered for NVIDIA/GPU):"
gpu_pci_probe="lspci 2>/dev/null | grep -iE '(nvidia|vga|3d|display)' || echo '  No GPU devices detected (this is normal without GPU hardware)'"
run_in_container "$gpu_pci_probe"

echo ""
echo "9. Testing container GPU device access..."
# The /dev/nvidia* glob is expanded by the shell inside the container. When
# `ls` finds nothing it fails and the "no devices" branch is taken.
echo -n "Checking /dev/nvidia* devices... "
if ! run_in_container "ls /dev/nvidia* 2>/dev/null" >/dev/null 2>&1; then
    echo "✗ No GPU devices (normal without GPU hardware)"
else
    echo "✓ GPU devices found"
    run_in_container "ls -la /dev/nvidia*"
fi

echo ""
echo "=== Test Summary ==="
echo "GPU Support Status:"
# Three outcomes are reported separately so that a missing nvidia-smi binary
# is not described as "GPU tools installed":
#   1. nvidia-smi cannot be resolved inside the container
#   2. nvidia-smi is present and exits successfully
#   3. nvidia-smi is present but exits non-zero
if ! run_in_container "command -v nvidia-smi" >/dev/null 2>&1; then
    echo "  ⚠ nvidia-smi is not installed in the container"
elif run_in_container "nvidia-smi" >/dev/null 2>&1; then
    echo "  ✓ Full GPU support detected"
else
    echo "  ⚠ GPU tools installed but no GPU hardware detected"
    echo "  This is normal when running without NVIDIA GPU"
fi
echo ""
# Reaching this point only means no fatal check aborted the run; report-only
# checks above may still have printed ✗.
echo "All tests completed successfully!"
echo "Container will be automatically cleaned up."