diff options
| author | hc <hc@email.ch> | 2025-05-31 23:44:40 +0800 |
|---|---|---|
| committer | hc <hc@email.ch> | 2025-05-31 23:44:40 +0800 |
| commit | d6eb567da3e6d2e64ebf22adf1fc6d21c47090f8 (patch) | |
| tree | 14c15830a8014001d5cc587b5b4d4454c880396e | |
| parent | ce511f49438761549e904d6e972b8c0635306ff9 (diff) | |
hehe
| -rw-r--r-- | docker_build/Dockerfile | 4 | ||||
| -rw-r--r-- | docker_build/Dockerfile.gpu | 40 | ||||
| -rw-r--r-- | docker_build/ssh-keys/macm4-resident.pub (renamed from ssh-keys/macm4-resident.pub) | 0 | ||||
| -rw-r--r-- | docs | 69 | ||||
| -rwxr-xr-x | podman_launch_devenv.py | 6 | ||||
| -rw-r--r-- | rocky-ssh-deployment.yaml | 2 | ||||
| -rw-r--r-- | rocky-ssh-gpu-deployment.yaml | 69 | ||||
| -rwxr-xr-x | tests/test_base_container.sh | 131 | ||||
| -rwxr-xr-x | tests/test_gpu_container.sh | 146 |
9 files changed, 459 insertions, 8 deletions
# NOTE(review): This capture collapsed every newline and every run of spaces into one space.
# The line breaks below follow the diff markers and the hunk line counts; indentation is
# reconstructed by convention and may differ from the committed files. Whitespace inside
# string literals is left exactly as captured.
#
# NOTE(review) docker_build/Dockerfile:
# - The COPY source moves from ssh-keys/ to docker_build/ssh-keys/. The path is written relative
#   to the repository root, so the build context must be the repository root.
diff --git a/docker_build/Dockerfile b/docker_build/Dockerfile
index 5df57d2..16f74d6 100644
--- a/docker_build/Dockerfile
+++ b/docker_build/Dockerfile
@@ -19,8 +19,8 @@ RUN mkdir -p /var/run/sshd && \
 RUN mkdir -p /root/.ssh && \
     chmod 700 /root/.ssh && \
     usermod -s /bin/bash root
-# Copy SSH public keys from ssh-keys directory into the image
-COPY ssh-keys/*.pub /tmp/ssh-keys/
+# Copy SSH public keys from docker_build/ssh-keys directory into the image
+COPY docker_build/ssh-keys/*.pub /tmp/ssh-keys/
 RUN cat /tmp/ssh-keys/*.pub > /root/.ssh/authorized_keys && \
     chmod 600 /root/.ssh/authorized_keys && \
     rm -rf /tmp/ssh-keys
# NOTE(review) docker_build/Dockerfile.gpu (new file, 40 lines):
# - The opening comment calls this a multi-stage build, but the file has a single FROM. It is a
#   one-stage image layered on rocky_dev:latest, so that image must exist before this one builds.
# - NVIDIA_VISIBLE_DEVICES=all is baked into the image environment. Presumably this makes every
#   host GPU visible wherever the NVIDIA runtime honours the variable - confirm that is intended.
# - nvidia-container-toolkit, kernel-headers and kernel-devel are installed inside the image.
#   Presumably the toolkit is only needed on the container host - confirm it is wanted here.
# - gpu-test.sh is assembled one echo at a time. The generated script has no `set -e`, and its
#   exit status is that of the closing if/else block.
# - WORKDIR, EXPOSE and CMD are restated. The in-file comment says they match the base image;
#   the base image's own values are not visible in this diff.
diff --git a/docker_build/Dockerfile.gpu b/docker_build/Dockerfile.gpu
new file mode 100644
index 0000000..7ed08a5
--- /dev/null
+++ b/docker_build/Dockerfile.gpu
@@ -0,0 +1,40 @@
+# Multi-stage build - GPU version builds on top of the base dev environment
+FROM rocky_dev:latest
+
+# Update and install GPU-specific packages
+RUN dnf update -y && \
+    dnf install -y kernel-headers kernel-devel pciutils && \
+    dnf clean all
+
+# Install NVIDIA container toolkit dependencies
+RUN dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo && \
+    dnf install -y nvidia-container-toolkit && \
+    dnf clean all
+
+# Set environment variables for NVIDIA
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# Add GPU test script
+RUN echo '#!/bin/bash' > /usr/local/bin/gpu-test.sh && \
+    echo 'echo "=== System Information ==="' >> /usr/local/bin/gpu-test.sh && \
+    echo 'cat /etc/rocky-release' >> /usr/local/bin/gpu-test.sh && \
+    echo 'echo' >> /usr/local/bin/gpu-test.sh && \
+    echo 'echo "=== PCI Devices (GPUs) ==="' >> /usr/local/bin/gpu-test.sh && \
+    echo 'lspci | grep -i nvidia' >> /usr/local/bin/gpu-test.sh && \
+    echo 'echo' >> /usr/local/bin/gpu-test.sh && \
+    echo 'echo "=== NVIDIA SMI ==="' >> /usr/local/bin/gpu-test.sh && \
+    echo 'if command -v nvidia-smi &> /dev/null; then' >> /usr/local/bin/gpu-test.sh && \
+    echo ' nvidia-smi' >> /usr/local/bin/gpu-test.sh && \
+    echo 'else' >> /usr/local/bin/gpu-test.sh && \
+    echo ' echo "nvidia-smi not found. GPU might not be accessible inside container."' >> /usr/local/bin/gpu-test.sh && \
+    echo 'fi' >> /usr/local/bin/gpu-test.sh && \
+    chmod +x /usr/local/bin/gpu-test.sh
+
+# Create workspace directory for GPU workloads
+RUN mkdir -p /workspace
+
+# Keep the same working directory and CMD from base image
+WORKDIR /root
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D", "-e"]
\ No newline at end of file
# NOTE(review): This capture collapsed every newline and every run of spaces into one space.
# The line breaks below follow the diff markers and the hunk line counts; indentation is
# reconstructed by convention and may differ from the committed files.
#
# NOTE(review) ssh-keys/macm4-resident.pub: pure rename (identical blob ids, no hunks).
diff --git a/ssh-keys/macm4-resident.pub b/docker_build/ssh-keys/macm4-resident.pub
index fbccb4f..fbccb4f 100644
--- a/ssh-keys/macm4-resident.pub
+++ b/docker_build/ssh-keys/macm4-resident.pub
# NOTE(review): The file header (diff --git / index / --- / +++) for the next three hunks is
# missing from this capture. The hunks total 67 insertions and 2 deletions, which matches the
# diffstat entry `docs | 69`, so they presumably belong to `docs`, not to the renamed key above.
# - The added text says the GPU image uses a "multi-stage build". The GPU Dockerfile added in
#   this commit has a single FROM; it is a second image built on top of rocky_dev:latest.
# - The added text shows `--replicas=4` as a scaling example; the GPU manifest added in this
#   commit starts with replicas: 2.
@@ -1,4 +1,42 @@
 # Rocky SSH Container
+## Setup
+### SSH Keys
+Place your SSH public keys in the `docker_build/ssh-keys/` directory:
+```bash
+cp ~/.ssh/id_ed25519.pub docker_build/ssh-keys/
+```
+The container will automatically add all `.pub` files from this directory to `/root/.ssh/authorized_keys`.
+
+## Building Containers
+### Base Development Container
+```bash
+# From the dev_env directory
+podman build -t rocky_dev:latest -f docker_build/Dockerfile .
+```
+### GPU-Enabled Container
+The GPU container builds on top of the base container using multi-stage build:
+```bash
+# First build the base container (from dev_env directory)
+podman build -t rocky_dev:latest -f docker_build/Dockerfile .
+# Then build the GPU version
+podman build -t rocky_dev_gpu:latest -f docker_build/Dockerfile.gpu .
+```
+
+## GPU Support
+The GPU-enabled container includes:
+- NVIDIA Container Toolkit for GPU access
+- GPU test script at `/usr/local/bin/gpu-test.sh`
+- Environment variables configured for NVIDIA GPU visibility
+- Workspace directory at `/workspace` for GPU workloads
+
+### Running with GPU Support
+```bash
+# Run GPU-enabled container
+podman run -it --device nvidia.com/gpu=all rocky_dev_gpu:latest
+# Test GPU inside container
+gpu-test.sh
+nvidia-smi
+```
 
 ## Podman
 ```bash
@@ -18,6 +56,15 @@ kubectl delete pod rocky-dev-0
 kubectl scale statefulset rocky-dev --replicas=10
 kubectl delete -f rocky-ssh-deployment.yaml
 ```
+### Kubernetes GPU Deployment
+```bash
+kubectl apply -f rocky-ssh-gpu-deployment.yaml
+kubectl get pods -l app=rocky-dev-gpu -o wide
+kubectl describe pod rocky-dev-gpu-0 | grep nvidia
+kubectl exec -it rocky-dev-gpu-0 -- nvidia-smi
+kubectl scale statefulset rocky-dev-gpu --replicas=4
+kubectl delete -f rocky-ssh-gpu-deployment.yaml
+```
 
 ## Local Registry
 ```bash
@@ -30,11 +77,29 @@ podman push localhost:5000/rocky_dev:latest --tls-verify=false
 ```bash
 # Direct shell
 kubectl exec -it rocky-dev-0 -- /bin/bash
-
 # SSH with agent forwarding (2 terminals)
 kubectl port-forward rocky-dev-0 2222:22
 ssh-agent bash -c 'ssh-add ~/macm4-resident && ssh -A -p 2222 root@localhost'
-
 # External
 kubectl port-forward --address 0.0.0.0 rocky-dev-0 9999:22
 ```
+
+## Features
+### Development Tools
+- C/C++ development: gcc, gcc-c++, make, cmake
+- Python 3 with pip and development headers
+- Rust toolchain with cargo tools (cargo-edit, bacon, evcxr_jupyter)
+- Node.js v22 via nvm
+- Claude Code CLI tool
+
+### System Utilities
+- SSH server with key-based authentication
+- tmux, vim, nano editors
+- htop, bmon for system monitoring
+- git, wget, tree, bat
+- Network tools: nc, net-tools, wireguard-tools
+
+### GPU Computing (GPU version only)
+- NVIDIA GPU support via container toolkit
+- GPU test utilities
+- Dedicated /workspace directory for ML/GPU workloads
# NOTE(review) podman_launch_devenv.py:
# - build() writes a placeholder docker_build/ssh-keys/dummy.pub when no *.pub file exists, and
#   removes that path after the build whether or not the build succeeded.
# - The removal is unconditional on the file name, so a real key that happens to be called
#   dummy.pub is deleted after the build as well.
# - The placeholder is written with open(...).write(...); the file object is never closed
#   explicitly.
# - Second hunk: the usage text now names the script file itself instead of launcher.py.
diff --git a/podman_launch_devenv.py b/podman_launch_devenv.py
index 2473404..3d0b5b0 100755
--- a/podman_launch_devenv.py
+++ b/podman_launch_devenv.py
@@ -15,9 +15,9 @@ import subprocess, argparse, os, glob
 def run(cmd): return subprocess.run(cmd, shell=True, capture_output=True, text=True)
 
 def build():
-    if not glob.glob("ssh-keys/*.pub"): os.makedirs("ssh-keys", exist_ok=True); open("ssh-keys/dummy.pub", "w").write("# dummy")
+    if not glob.glob("docker_build/ssh-keys/*.pub"): os.makedirs("docker_build/ssh-keys", exist_ok=True); open("docker_build/ssh-keys/dummy.pub", "w").write("# dummy")
     result = run("podman build -f docker_build/Dockerfile -t rocky_dev:latest .")
-    if os.path.exists("ssh-keys/dummy.pub"): os.remove("ssh-keys/dummy.pub")
+    if os.path.exists("docker_build/ssh-keys/dummy.pub"): os.remove("docker_build/ssh-keys/dummy.pub")
     return result.returncode == 0
 
 def launch():
@@ -48,6 +48,6 @@ elif args.command == "run":
     else:
         print("❌ Image rocky_dev:latest not found")
 else:
-    print("Usage: python3 launcher.py {run|list|cleanup} [-p PORT]")
+    print("Usage: python3 podman_launch_devenv.py {run|list|cleanup} [-p PORT]")
     print("🐚 Shell: podman exec -it rocky_dev-<port> /bin/bash")
     print("💡 Tip: For direct shell without port forwarding, use: podman run -it rocky_dev:latest /bin/bash")
# NOTE(review) rocky-ssh-deployment.yaml:
# - The Service selector changes from rocky-dev-deploy to rocky-dev. Presumably this now matches
#   the pod label of the rocky-dev StatefulSet; that label is outside this hunk - confirm.
diff --git a/rocky-ssh-deployment.yaml b/rocky-ssh-deployment.yaml
index 0d30e59..bb6c37f 100644
--- a/rocky-ssh-deployment.yaml
+++ b/rocky-ssh-deployment.yaml
@@ -42,7 +42,7 @@ metadata:
 spec:
   clusterIP: None
   selector:
-    app: rocky-dev-deploy
+    app: rocky-dev
   ports:
   - port: 22
     targetPort: 22
# NOTE(review) rocky-ssh-gpu-deployment.yaml (new file, 69 lines):
# - The container runs with privileged: true and also requests nvidia.com/gpu: 1. Confirm that
#   privileged mode is actually required for GPU access in the target cluster.
# - NVIDIA_VISIBLE_DEVICES is set to "all" in the pod spec next to a one-GPU limit. Presumably
#   this can override the per-pod device assignment so each pod sees every GPU - verify.
# - image: rocky_dev_gpu:latest with imagePullPolicy: IfNotPresent and no registry prefix; the
#   inline comment says a local image is expected, so each node must already hold the image.
# - Service rocky-dev-gpu-svc is headless (clusterIP: None) and selects app: rocky-dev-gpu, the
#   same label the pod template sets.
diff --git a/rocky-ssh-gpu-deployment.yaml b/rocky-ssh-gpu-deployment.yaml
new file mode 100644
index 0000000..062ccae
--- /dev/null
+++ b/rocky-ssh-gpu-deployment.yaml
@@ -0,0 +1,69 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: rocky-dev-gpu
+  labels:
+    app: rocky-dev-gpu
+spec:
+  serviceName: rocky-dev-gpu-svc
+  replicas: 2
+  selector:
+    matchLabels:
+      app: rocky-dev-gpu
+  template:
+    metadata:
+      labels:
+        app: rocky-dev-gpu
+    spec:
+      containers:
+      - name: rocky-dev-gpu
+        image: rocky_dev_gpu:latest
+        imagePullPolicy: IfNotPresent # Use local image
+        ports:
+        - containerPort: 22
+          name: ssh
+        securityContext:
+          privileged: true
+        resources:
+          limits:
+            nvidia.com/gpu: 1 # Request 1 GPU per pod
+          requests:
+            nvidia.com/gpu: 1
+        env:
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: "all"
+        - name: NVIDIA_DRIVER_CAPABILITIES
+          value: "compute,utility"
+        volumeMounts:
+        - name: workspace
+          mountPath: /workspace
+        livenessProbe:
+          tcpSocket:
+            port: 22
+          initialDelaySeconds: 30
+          periodSeconds: 30
+        readinessProbe:
+          tcpSocket:
+            port: 22
+          initialDelaySeconds: 5
+          periodSeconds: 10
+  volumeClaimTemplates:
+  - metadata:
+      name: workspace
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 10Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: rocky-dev-gpu-svc
+spec:
+  clusterIP: None
+  selector:
+    app: rocky-dev-gpu
+  ports:
+  - port: 22
+    targetPort: 22
\ No newline at end of file
# NOTE(review): This capture collapsed every newline and every run of spaces into one space.
# The line breaks below follow the diff markers and the hunk line count (131); indentation is
# reconstructed by convention, and whitespace inside string literals is left as captured.
#
# NOTE(review) tests/test_base_container.sh (new file):
# - The script runs under `set -e`, and check_command returns 1 on a miss. Every bare
#   `check_command <tool>` call in sections 2 and 3 therefore ends the script at the first
#   missing tool; the EXIT trap still removes the container.
# - All other checks use `cmd && echo "✓ ..." || echo "✗ ..."`, which always ends with a
#   successful echo. A failed check in sections 4-9 prints ✗ but never changes the exit status.
# - "All tests completed successfully!" is printed unconditionally once the end is reached.
# - The sshd_config checks use an unanchored `grep -q`, so a commented-out line such as
#   `#PermitRootLogin yes` also counts as a pass.
# - The port check greps nc's output for the word "succeeded". Presumably that wording depends
#   on the installed nc variant - verify on the target host.
# - TEST_PORT is a random value from shuf; nothing checks that the port is free before use.
diff --git a/tests/test_base_container.sh b/tests/test_base_container.sh
new file mode 100755
index 0000000..b5115ec
--- /dev/null
+++ b/tests/test_base_container.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+# Container Test Script for rocky_dev:latest
+# This script tests all the functionality of the base container
+
+set -e
+
+CONTAINER_NAME="rocky_dev_test_$$"
+IMAGE_NAME="rocky_dev:latest"
+TEST_PORT=$(shuf -i 30000-40000 -n 1)
+
+# Cleanup function
+cleanup() {
+    echo ""
+    echo "Cleaning up..."
+    podman stop $CONTAINER_NAME >/dev/null 2>&1 || true
+    podman rm $CONTAINER_NAME >/dev/null 2>&1 || true
+    echo "Container $CONTAINER_NAME removed"
+}
+
+# Set trap to cleanup on exit
+trap cleanup EXIT
+
+echo "=== Rocky Dev Container Test Suite ==="
+echo "Container: $CONTAINER_NAME"
+echo "Port: $TEST_PORT"
+echo ""
+
+# Function to run commands in container
+run_in_container() {
+    podman exec $CONTAINER_NAME bash -c "$1"
+}
+
+# Function to check if command exists
+check_command() {
+    local cmd=$1
+    echo -n "Checking $cmd... "
+    if run_in_container "command -v $cmd" >/dev/null 2>&1; then
+        echo "✓"
+        return 0
+    else
+        echo "✗"
+        return 1
+    fi
+}
+
+# Start container
+echo "1. Starting container..."
+podman run -d -p ${TEST_PORT}:22 --name $CONTAINER_NAME $IMAGE_NAME
+sleep 5
+
+echo ""
+echo "2. Testing system packages..."
+# Test core development tools
+check_command gcc
+check_command g++
+check_command make
+check_command cmake
+check_command git
+check_command python3
+check_command pip3
+
+echo ""
+echo "3. Testing system utilities..."
+# Test system utilities
+check_command tmux
+check_command vim
+check_command nano
+check_command tree
+check_command htop
+check_command bmon
+check_command wget
+check_command nc
+check_command bat
+
+echo ""
+echo "4. Testing SSH configuration..."
+# Check SSH daemon
+run_in_container "ps aux | grep sshd | grep -v grep" && echo "✓ SSH daemon running" || echo "✗ SSH daemon not running"
+
+# Check SSH config
+run_in_container "grep -q 'PubkeyAuthentication yes' /etc/ssh/sshd_config" && echo "✓ PubkeyAuthentication enabled" || echo "✗ PubkeyAuthentication not enabled"
+run_in_container "grep -q 'PermitRootLogin yes' /etc/ssh/sshd_config" && echo "✓ PermitRootLogin enabled" || echo "✗ PermitRootLogin not enabled"
+
+# Check SSH directory
+run_in_container "test -d /root/.ssh && test -f /root/.ssh/authorized_keys" && echo "✓ SSH directory configured" || echo "✗ SSH directory not configured"
+
+echo ""
+echo "5. Testing Rust installation..."
+# Test Rust
+run_in_container "source /root/.cargo/env && cargo --version" && echo "✓ Cargo installed" || echo "✗ Cargo not installed"
+run_in_container "source /root/.cargo/env && rustc --version" && echo "✓ Rust compiler installed" || echo "✗ Rust compiler not installed"
+
+# Test Rust tools
+echo "Checking Rust tools..."
+for tool in cargo-clone cargo-add cargo-info bacon dust; do
+    run_in_container "source /root/.cargo/env && command -v $tool" >/dev/null 2>&1 && echo " ✓ $tool" || echo " ✗ $tool"
+done
+# Check evcxr_jupyter separately (it's a Jupyter kernel, not a CLI tool)
+run_in_container "source /root/.cargo/env && ls ~/.cargo/bin/evcxr_jupyter" >/dev/null 2>&1 && echo " ✓ evcxr_jupyter (Rust Jupyter kernel)" || echo " ✗ evcxr_jupyter"
+
+echo ""
+echo "6. Testing Node.js installation..."
+# Test Node.js
+run_in_container "source /root/.nvm/nvm.sh && node --version" && echo "✓ Node.js installed" || echo "✗ Node.js not installed"
+run_in_container "source /root/.nvm/nvm.sh && npm --version" && echo "✓ npm installed" || echo "✗ npm not installed"
+
+# Test claude-code
+run_in_container "source /root/.nvm/nvm.sh && claude --version" >/dev/null 2>&1 && echo "✓ claude-code installed" || echo "✗ claude-code not installed"
+
+echo ""
+echo "7. Testing environment configuration..."
+# Test bash configuration
+run_in_container "grep -q 'LS_COLORS' /etc/bashrc" && echo "✓ LS_COLORS configured" || echo "✗ LS_COLORS not configured"
+run_in_container "grep -q 'PS1=' /etc/bashrc" && echo "✓ Custom prompt configured" || echo "✗ Custom prompt not configured"
+
+echo ""
+echo "8. Testing SSH connectivity..."
+# Test SSH connection (this will fail without proper keys)
+echo -n "Testing SSH port accessibility... "
+nc -zv localhost $TEST_PORT 2>&1 | grep -q succeeded && echo "✓" || echo "✗"
+
+echo ""
+echo "9. Testing file system..."
+# Check working directory
+run_in_container "pwd" | grep -q "/root" && echo "✓ Working directory is /root" || echo "✗ Working directory incorrect"
+
+echo ""
+echo "=== Test Summary ==="
+echo "All tests completed successfully!"
+echo "Container will be automatically cleaned up."
\ No newline at end of file
# NOTE(review): This capture collapsed every newline and every run of spaces into one space.
# The line breaks below follow the diff markers and the hunk line count (146); indentation is
# reconstructed by convention, and whitespace inside string literals is left as captured.
#
# NOTE(review) tests/test_gpu_container.sh (new file):
# - The script runs under `set -e`, so a failing `podman run ... --device nvidia.com/gpu=all`
#   ends it in step 1. Presumably that command fails on a host with no GPU device configured,
#   in which case none of the later "normal without GPU hardware" branches can run - confirm.
# - `check_command lspci` is a bare call and aborts the script on a miss, whereas
#   `check_command nvidia-smi || echo ...` tolerates one.
# - The base-tools loop exits with status 1 on the first missing tool. Every other check uses
#   `cmd && echo "✓ ..." || echo "✗ ..."` and can never change the exit status.
# - "All tests completed successfully!" is printed unconditionally once the end is reached.
# - TEST_PORT is a random value from shuf; nothing checks that the port is free before use.
diff --git a/tests/test_gpu_container.sh b/tests/test_gpu_container.sh
new file mode 100755
index 0000000..593f927
--- /dev/null
+++ b/tests/test_gpu_container.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+
+# Container Test Script for rocky_dev_gpu:latest
+# This script tests all the functionality of the GPU-enabled container
+
+set -e
+
+CONTAINER_NAME="rocky_dev_gpu_test_$$"
+IMAGE_NAME="rocky_dev_gpu:latest"
+TEST_PORT=$(shuf -i 40000-50000 -n 1)
+
+# Cleanup function
+cleanup() {
+    echo ""
+    echo "Cleaning up..."
+    podman stop $CONTAINER_NAME >/dev/null 2>&1 || true
+    podman rm $CONTAINER_NAME >/dev/null 2>&1 || true
+    echo "Container $CONTAINER_NAME removed"
+}
+
+# Set trap to cleanup on exit
+trap cleanup EXIT
+
+echo "=== Rocky Dev GPU Container Test Suite ==="
+echo "Container: $CONTAINER_NAME"
+echo "Port: $TEST_PORT"
+echo ""
+
+# Function to run commands in container
+run_in_container() {
+    podman exec $CONTAINER_NAME bash -c "$1"
+}
+
+# Function to check if command exists
+check_command() {
+    local cmd=$1
+    echo -n "Checking $cmd... "
+    if run_in_container "command -v $cmd" >/dev/null 2>&1; then
+        echo "✓"
+        return 0
+    else
+        echo "✗"
+        return 1
+    fi
+}
+
+# Start container with GPU support
+echo "1. Starting GPU container..."
+podman run -d -p ${TEST_PORT}:22 --device nvidia.com/gpu=all --name $CONTAINER_NAME $IMAGE_NAME
+sleep 5
+
+echo ""
+echo "2. Testing base container functionality..."
+echo "(Inherited from rocky_dev:latest)"
+
+# Quick check of base tools
+echo -n "Development tools: "
+for cmd in gcc g++ make cmake git python3; do
+    run_in_container "command -v $cmd" >/dev/null 2>&1 || { echo "✗ Missing $cmd"; exit 1; }
+done
+echo "✓"
+
+echo -n "Rust toolchain: "
+run_in_container "source /root/.cargo/env && cargo --version" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo -n "Node.js: "
+run_in_container "source /root/.nvm/nvm.sh && node --version" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo ""
+echo "3. Testing GPU-specific packages..."
+# Check for GPU utilities
+check_command lspci
+check_command nvidia-smi || echo " (nvidia-smi requires actual GPU hardware)"
+
+# Check for kernel packages
+echo -n "Checking kernel headers... "
+run_in_container "rpm -q kernel-headers" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo -n "Checking kernel-devel... "
+run_in_container "rpm -q kernel-devel" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo -n "Checking pciutils... "
+run_in_container "rpm -q pciutils" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo ""
+echo "4. Testing NVIDIA container toolkit..."
+echo -n "Checking nvidia-container-toolkit... "
+run_in_container "rpm -q nvidia-container-toolkit" >/dev/null 2>&1 && echo "✓" || echo "✗"
+
+echo ""
+echo "5. Testing GPU environment variables..."
+# Check environment variables
+echo -n "NVIDIA_VISIBLE_DEVICES... "
+run_in_container "echo \$NVIDIA_VISIBLE_DEVICES" | grep -q "all" && echo "✓ Set to 'all'" || echo "✗ Not set correctly"
+
+echo -n "NVIDIA_DRIVER_CAPABILITIES... "
+run_in_container "echo \$NVIDIA_DRIVER_CAPABILITIES" | grep -q "compute,utility" && echo "✓ Set to 'compute,utility'" || echo "✗ Not set correctly"
+
+echo ""
+echo "6. Testing GPU test script..."
+# Check if gpu-test.sh exists and is executable
+echo -n "Checking /usr/local/bin/gpu-test.sh... "
+run_in_container "test -x /usr/local/bin/gpu-test.sh" && echo "✓ Exists and executable" || echo "✗ Not found or not executable"
+
+# Run the GPU test script
+echo ""
+echo "Running GPU test script:"
+echo "------------------------"
+run_in_container "/usr/local/bin/gpu-test.sh" || echo "Note: Some GPU tests may fail without actual GPU hardware"
+echo "------------------------"
+
+echo ""
+echo "7. Testing workspace directory..."
+# Check workspace directory
+echo -n "Checking /workspace directory... "
+run_in_container "test -d /workspace" && echo "✓ Exists" || echo "✗ Not found"
+
+echo ""
+echo "8. Testing PCI device detection..."
+# Try to detect any NVIDIA devices
+echo "PCI devices (filtered for NVIDIA/GPU):"
+run_in_container "lspci 2>/dev/null | grep -iE '(nvidia|vga|3d|display)' || echo ' No GPU devices detected (this is normal without GPU hardware)'"
+
+echo ""
+echo "9. Testing container GPU device access..."
+# Check if container has GPU device access
+echo -n "Checking /dev/nvidia* devices... "
+if run_in_container "ls /dev/nvidia* 2>/dev/null" >/dev/null 2>&1; then
+    echo "✓ GPU devices found"
+    run_in_container "ls -la /dev/nvidia*"
+else
+    echo "✗ No GPU devices (normal without GPU hardware)"
+fi
+
+echo ""
+echo "=== Test Summary ==="
+echo "GPU Support Status:"
+if run_in_container "command -v nvidia-smi && nvidia-smi" >/dev/null 2>&1; then
+    echo " ✓ Full GPU support detected"
+else
+    echo " ⚠ GPU tools installed but no GPU hardware detected"
+    echo " This is normal when running without NVIDIA GPU"
+fi
+echo ""
+echo "All tests completed successfully!"
+echo "Container will be automatically cleaned up."
\ No newline at end of file |
