view gcp_batch_netcat.py @ 11:fe0bf22037a5 draft

planemo upload for repository https://github.com/afgane/gcp_batch_netcat commit f730cbb207e028a5d4fd982fe65ece7345af4879
author enis
date Thu, 14 Aug 2025 16:39:36 +0000
parents cdfa2e1a7ef4
children
line wrap: on
line source

import argparse
import json
import logging
import os
import sys
import uuid
from google.cloud import batch_v1

# Configure logging to go to stdout instead of stderr to avoid Galaxy marking job as failed
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger(__name__)



def determine_test_target(args):
    """Determine the target host and port based on test type"""

    if args.test_type == 'nfs':
        # NFS server address is required
        if not args.nfs_address:
            raise ValueError("NFS server address is required. Please provide --nfs_address parameter with the LoadBalancer external IP.")

        nfs_address = args.nfs_address
        logger.info(f"Using provided NFS address: {nfs_address}")
        return nfs_address, 2049

    else:
        raise ValueError(f"Unsupported test type: {args.test_type}")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nfs_address', required=True, help='NFS server LoadBalancer external IP address (required)')
    parser.add_argument('--output', required=True)
    parser.add_argument('--project', required=False, help='GCP Project ID (if not provided, will be extracted from service account key)')
    parser.add_argument('--region', required=True)
    parser.add_argument('--network', default='default', help='GCP Network name')
    parser.add_argument('--subnet', default='default', help='GCP Subnet name')
    parser.add_argument('--service_account_key', required=True)
    args = parser.parse_args()

    # Default to NFS test type since that's what this tool is for
    args.test_type = 'nfs'

    # Set up authentication using the service account key
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.service_account_key
    logger.info(f"Authentication configured with service account: {args.service_account_key}")

    # Extract GCP project ID from service account key if not provided
    if args.project:
        project_id = args.project
        logger.info(f"Using provided project ID: {project_id}")
    else:
        try:
            with open(args.service_account_key, 'r') as f:
                service_account_data = json.load(f)
            project_id = service_account_data.get('project_id')
            if not project_id:
                raise ValueError("project_id not found in service account key file")
            logger.info(f"Extracted project ID from service account key: {project_id}")
        except Exception as e:
            logger.error(f"Failed to extract project ID from service account key: {e}")
            raise

    # Determine target host and port based on test type
    try:
        target_host, target_port = determine_test_target(args)
        logger.info(f"Target determined: {target_host}:{target_port}")
    except Exception as e:
        logger.error(f"Failed to determine target: {e}")
        raise

    job_name = f'netcat-job-{uuid.uuid4()}'
    logger.info(f"Generated job name: {job_name}")

    # Create Batch client
    logger.info("Creating Batch client...")
    client = batch_v1.BatchServiceClient()
    logger.info("Batch client created successfully")

    # Create a comprehensive test script
    test_script = f'''#!/bin/bash
set -e
echo "=== GCP Batch NFS Connectivity Test ==="
echo "Target: {target_host}:{target_port}"
echo "Timestamp: $(date)"
echo "Container hostname: $(hostname)"
echo "Host VM Image: galaxy-k8s-boot-v2025-08-12"
echo "Container Image: afgane/gcp-batch-netcat:0.3.0"
echo ""

# Basic system info
echo "=== System Information ==="
echo "OS Release:"
cat /etc/os-release | head -5 2>/dev/null || echo "OS release info not available"
echo "Kernel version:"
uname -r
echo "Architecture:"
uname -m
echo ""

# Basic network info
echo "=== Network Information ==="
echo "Container IP addresses:"
hostname -I
echo "Default route:"
ip route | grep default || echo "No default route found"
echo ""

# DNS configuration
echo "=== DNS Configuration ==="
echo "DNS servers:"
cat /etc/resolv.conf | grep nameserver || echo "No nameservers found"
echo ""

# Test DNS resolution of target
echo "=== DNS Resolution Test ==="
echo "Resolving {target_host}:"
nslookup {target_host} || {{
    echo "DNS resolution failed for {target_host}"
    echo "Trying with Google DNS (8.8.8.8):"
    nslookup {target_host} 8.8.8.8 || echo "DNS resolution failed even with Google DNS"
}}
echo ""

# Basic connectivity test
echo "=== Primary NFS Connectivity Test ==="
echo "Testing connection to NFS server {target_host}:{target_port}..."
timeout 30 nc -z -v -w 10 {target_host} {target_port}
nc_result=$?
echo "Netcat result: $nc_result"
echo ""

# NFS client capabilities
echo "=== NFS Client Information ==="
echo "NFS client version:"
/sbin/mount.nfs -V 2>/dev/null || echo "mount.nfs not available"
echo "RPC services:"
rpcinfo -p 2>/dev/null || echo "rpcinfo not available"
echo ""

# Additional connectivity tests
echo "=== Additional Connectivity Tests ==="
echo "Testing external connectivity (Google DNS 8.8.8.8:53):"
timeout 10 nc -z -v -w 5 8.8.8.8 53 && echo "✓ External DNS reachable" || echo "✗ External DNS unreachable"

echo ""
echo "=== Network Troubleshooting ==="
echo "Route table:"
ip route
echo ""

# NFS Mount Test - Check if Batch mounted it for us
echo "=== NFS Mount Test (via Batch Volume) ==="
NFS_MOUNT_POINT="/mnt/nfs"
mount_result=1

echo "Checking if NFS is mounted by Batch at $NFS_MOUNT_POINT..."
if [ -d "$NFS_MOUNT_POINT" ]; then
    echo "✓ NFS mount point exists"

    # Check if it's actually mounted
    if mount | grep "$NFS_MOUNT_POINT"; then
        mount_result=0
        echo "✓ NFS mounted by Batch successfully!"

        echo ""
        echo "=== NFS Share Contents ==="
        echo "Long listing of NFS share:"
        ls -la "$NFS_MOUNT_POINT" 2>/dev/null || echo "Could not list directory contents"

        echo ""
        echo "Disk usage of NFS share:"
        df -h "$NFS_MOUNT_POINT" 2>/dev/null || echo "Could not get disk usage"

        # Look for export subdirectories
        echo ""
        echo "=== Looking for export directories ==="
        if [ -d "$NFS_MOUNT_POINT/export" ]; then
            echo "✓ Found: export directory"
            ls -la "$NFS_MOUNT_POINT/export" | head -10 2>/dev/null || echo "Could not list export contents"

            # Look for PVC subdirectories
            echo "Looking for PVC directories in export..."
            find "$NFS_MOUNT_POINT/export" -name "pvc-*" -type d | head -5 2>/dev/null || echo "No PVC directories found"
        else
            echo "✗ No export directory found"
        fi

        # Try to find common Galaxy directories
        echo ""
        echo "=== Looking for Galaxy directories ==="

        # First check if they exist directly in the NFS root
        galaxy_dirs_in_root=0
        for dir in "jobs_directory" "shed_tools" "objects" "tools" "cache" "config"; do
            if [ -d "$NFS_MOUNT_POINT/$dir" ]; then
                echo "✓ Found in root: $dir"
                ls -la "$NFS_MOUNT_POINT/$dir" | head -5
                galaxy_dirs_in_root=$((galaxy_dirs_in_root + 1))
            fi
        done

        if [ $galaxy_dirs_in_root -eq 0 ]; then
            echo "✗ No Galaxy directories found in NFS root"
        else
            echo "✓ Found $galaxy_dirs_in_root Galaxy directories in NFS root"
        fi

        # Then check inside any PVC directories under export
        if [ -d "$NFS_MOUNT_POINT/export" ]; then
            echo ""
            echo "=== Checking PVC directories for Galaxy structure ==="

            # Find all PVC directories
            pvc_count=0
            for pvc_dir in $(find "$NFS_MOUNT_POINT/export" -name "pvc-*" -type d 2>/dev/null); do
                pvc_count=$((pvc_count + 1))
                echo ""
                echo "Checking PVC ($pvc_count): $(basename $pvc_dir)"
                echo "  Full path: $pvc_dir"

                # Show directory listing of PVC
                echo "  Contents:"
                ls -la "$pvc_dir" | head -10 | sed 's/^/    /'

                # Check for Galaxy directories inside this PVC
                galaxy_dirs_found=0
                for dir in "jobs_directory" "shed_tools" "objects" "tools" "cache" "config" "deps" "tmp"; do
                    if [ -d "$pvc_dir/$dir" ]; then
                        echo "  ✓ Found Galaxy directory: $dir"
                        # Show a sample of contents
                        ls -la "$pvc_dir/$dir" 2>/dev/null | head -3 | sed 's/^/      /'
                        galaxy_dirs_found=$((galaxy_dirs_found + 1))
                    fi
                done

                # Check for Galaxy-specific files
                galaxy_files_found=0
                for file in "galaxy.yml" "universe_wsgi.ini" "config/galaxy.yml" "results.sqlite" "celery-beat-schedule"; do
                    if [ -f "$pvc_dir/$file" ]; then
                        echo "  ✓ Found Galaxy file: $file"
                        galaxy_files_found=$((galaxy_files_found + 1))
                    fi
                done

                total_indicators=$((galaxy_dirs_found + galaxy_files_found))
                if [ $total_indicators -gt 0 ]; then
                    echo "  🎯 This PVC contains $galaxy_dirs_found Galaxy directories and $galaxy_files_found Galaxy files"

                    # Test write access
                    test_file="$pvc_dir/.batch_test_file_$(date +%s)"
                    if echo "test" > "$test_file" 2>/dev/null; then
                        echo "  ✓ Write access confirmed"
                        rm -f "$test_file" 2>/dev/null
                    else
                        echo "  ✗ No write access"
                    fi

                    # Test specific Galaxy directories access
                    if [ -d "$pvc_dir/jobs_directory" ]; then
                        echo "  � Jobs directory details:"
                        du -sh "$pvc_dir/jobs_directory" 2>/dev/null | sed 's/^/      /' || echo "      Could not get size"
                        job_count=$(find "$pvc_dir/jobs_directory" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l)
                        echo "      Job subdirectories: $job_count"
                    fi

                    if [ -d "$pvc_dir/shed_tools" ]; then
                        echo "  🔧 Shed tools directory details:"
                        du -sh "$pvc_dir/shed_tools" 2>/dev/null | sed 's/^/      /' || echo "      Could not get size"
                        tool_count=$(find "$pvc_dir/shed_tools" -name "*.py" -o -name "*.xml" 2>/dev/null | wc -l)
                        echo "      Tool files (py/xml): $tool_count"
                    fi
                else
                    echo "  ✗ No Galaxy directories or files found in this PVC"
                fi
            done

            if [ $pvc_count -eq 0 ]; then
                echo "✗ No PVC directories found in export"
            else
                echo ""
                echo "📊 Summary: Found $pvc_count PVC directories in export"
            fi
        else
            echo ""
            echo "✗ No export directory found in NFS mount"
        fi
    else
        echo "✗ NFS mount point exists but is not mounted"
        echo "This suggests Batch volume configuration may be incorrect"
    fi
else
    echo "✗ NFS mount point $NFS_MOUNT_POINT does not exist"
    echo "This suggests Batch volume was not configured"
fi

# CVMFS Mount Test
echo ""
echo "=== CVMFS Access Test ==="
echo "Checking if CVMFS is bind-mounted from host VM..."
if [ -d "/cvmfs" ]; then
    echo "✓ /cvmfs directory exists (bind-mounted from host)"
    ls -la /cvmfs 2>/dev/null || echo "Could not list /cvmfs contents"

    echo ""
    echo "Checking for Galaxy CVMFS repository..."
    cvmfs_result=1
    if [ -d "/cvmfs/data.galaxyproject.org" ]; then
        cvmfs_result=0
        echo "✓ Galaxy CVMFS repository accessible!"

        echo ""
        echo "=== CVMFS Repository Contents ==="
        echo "Long listing of CVMFS repository root:"
        ls -la "/cvmfs/data.galaxyproject.org" 2>/dev/null | head -10 || echo "Could not list directory contents"

        echo ""
        echo "Listing Galaxy reference data directories:"
        for dir in "byhand" "managed"; do
            if [ -d "/cvmfs/data.galaxyproject.org/$dir" ]; then
                echo "✓ Found CVMFS directory: $dir"
                ls "/cvmfs/data.galaxyproject.org/$dir" | head -5 2>/dev/null || echo "Could not list contents"
            else
                echo "✗ Not found: $dir"
            fi
        done

        echo ""
        echo "=== CVMFS File Access Test ==="
        echo "Testing access to specific Galaxy reference file..."
        echo "File: /cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10/seq/Arabidopsis_thaliana_TAIR10.fa.fai"

        CVMFS_TEST_FILE="/cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10/seq/Arabidopsis_thaliana_TAIR10.fa.fai"
        if [ -f "$CVMFS_TEST_FILE" ]; then
            echo "✓ File exists, reading first 10 lines:"
            head "$CVMFS_TEST_FILE" 2>/dev/null || echo "Could not read file contents"
        else
            echo "✗ File not found"
            echo "Checking if parent directories exist:"
            [ -d "/cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10" ] && echo "  ✓ Arabidopsis_thaliana_TAIR10 directory exists" || echo "  ✗ Arabidopsis_thaliana_TAIR10 directory missing"
            [ -d "/cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10/seq" ] && echo "  ✓ seq directory exists" || echo "  ✗ seq directory missing"
        fi

        echo ""
        echo "CVMFS mount information from host:"
        mount | grep cvmfs || echo "CVMFS mount info not visible from container"
    else
        echo "✗ Galaxy CVMFS repository not found at /cvmfs/data.galaxyproject.org"
        echo "This may indicate:"
        echo "- CVMFS client not running on host VM"
        echo "- Repository not mounted on host"
        echo "- Bind mount not properly configured"
    fi
else
    echo "✗ /cvmfs directory not found"
    echo "This indicates the bind mount from host VM failed"
    echo "Expected: /cvmfs from host VM bind-mounted into container"
fi


echo ""
echo "=== Final Result ==="
if [ $nc_result -eq 0 ] && [ $mount_result -eq 0 ]; then
    echo "✓ SUCCESS: Both network connectivity and NFS mount to {target_host}:{target_port} successful"
    if [ $cvmfs_result -eq 0 ]; then
        echo "✓ BONUS: CVMFS repository mount also successful"
    else
        echo "ℹ INFO: CVMFS mount failed (may not be available in this image)"
    fi
    exit 0
elif [ $nc_result -eq 0 ]; then
    echo "⚠ PARTIAL SUCCESS: Network connectivity successful but NFS mount failed"
    echo "Network connection to {target_host}:{target_port} works, but NFS mounting failed."
    echo "This suggests:"
    echo "- NFS server is reachable but may not be properly configured"
    echo "- NFS export permissions may be incorrect"
    echo "- Firewall may allow port 2049 but block other NFS ports (111, 20048)"
    if [ $cvmfs_result -eq 0 ]; then
        echo "✓ CVMFS repository mount was successful"
    fi
    exit 1
else
    echo "✗ FAILED: Network connectivity to NFS server {target_host}:{target_port} failed"
    echo "This suggests a network connectivity issue between GCP Batch and the NFS server."
    echo "Common causes:"
    echo "- Firewall rules blocking NFS traffic (port 2049)"
    echo "- NFS service not accessible from external networks (only ClusterIP)"
    echo "- NFS server not properly exposed via LoadBalancer"
    echo ""
    echo "Solutions:"
    echo "- Ensure NFS service has type LoadBalancer with external IP"
    echo "- Check GCP firewall rules allow traffic from Batch subnet to NFS"
    echo "- Verify the IP address is the LoadBalancer external IP, not ClusterIP"
    if [ $cvmfs_result -eq 0 ]; then
        echo ""
        echo "✓ CVMFS repository mount was successful (good network connectivity to external services)"
    fi
    exit 1
fi
'''

    # Define the job using the Python client library objects
    logger.info("Building job specification...")

    # Escape the test script for use in docker command (outside f-string to avoid backslash issues)
    escaped_test_script = test_script.replace("'", "'\"'\"'")

    # Create a host script that triggers CVMFS mount and then runs the container
    host_script = f'''#!/bin/bash
set -e
echo "=== Pre-Container Host Script ==="
echo "Timestamp: $(date)"
echo "Host VM Image: galaxy-k8s-boot-v2025-08-12"
echo "Running on host before container starts..."
echo ""

echo "=== Triggering CVMFS Mount ==="
echo "Checking CVMFS autofs status:"
mount | grep cvmfs || echo "No CVMFS mounts yet"

echo ""
echo "Triggering CVMFS mount by accessing repository:"
ls /cvmfs/data.galaxyproject.org/ || echo "Could not access CVMFS repository"

echo ""
echo "After access - checking CVMFS mounts:"
mount | grep cvmfs || echo "Still no CVMFS mounts visible"

echo ""
echo "Testing specific file access from host:"
if [ -f "/cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10/seq/Arabidopsis_thaliana_TAIR10.fa.fai" ]; then
    echo "✓ CVMFS file accessible from host"
    head -3 "/cvmfs/data.galaxyproject.org/byhand/Arabidopsis_thaliana_TAIR10/seq/Arabidopsis_thaliana_TAIR10.fa.fai"
else
    echo "✗ CVMFS file not accessible from host"
fi

echo ""
echo "=== Starting Container ==="
echo "Running container with bind-mounted CVMFS and NFS..."

# Run the container with the test script and volume mounts
docker run --rm \\
    -v /cvmfs:/cvmfs:ro \\
    -v /mnt/nfs:/mnt/nfs:rw \\
    afgane/gcp-batch-netcat:0.3.0 \\
    /bin/bash -c '{escaped_test_script}'
'''

    runnable = batch_v1.Runnable()
    runnable.script = batch_v1.Runnable.Script()
    runnable.script.text = host_script
    logger.debug(f"Host script configured to trigger CVMFS mount and run container")

    task = batch_v1.TaskSpec()
    task.runnables = [runnable]
    task.compute_resource = batch_v1.ComputeResource()
    task.compute_resource.cpu_milli = 1000
    task.compute_resource.memory_mib = 1024
    logger.debug(f"Compute resources: CPU={task.compute_resource.cpu_milli}m, Memory={task.compute_resource.memory_mib}MiB")

    # Configure NFS volume in the task
    volume = batch_v1.Volume()
    volume.nfs = batch_v1.NFS()
    volume.nfs.server = target_host
    volume.nfs.remote_path = "/"  # Root of the NFS export
    volume.mount_path = "/mnt/nfs"

    task.volumes = [volume]
    logger.debug(f"NFS volume configured: {target_host}:/ -> /mnt/nfs")

    task_group = batch_v1.TaskGroup()
    task_group.task_count = 1
    task_group.parallelism = 1
    task_group.task_spec = task
    logger.debug(f"Task group: count={task_group.task_count}, parallelism={task_group.parallelism}")

    # Network configuration: Batch job should run in the same network as the NFS server
    network_interface = batch_v1.AllocationPolicy.NetworkInterface()
    network_interface.network = f"global/networks/{args.network}"
    network_interface.subnetwork = f"regions/{args.region}/subnetworks/{args.subnet}"
    logger.debug(f"Network: {network_interface.network}")
    logger.debug(f"Subnet: {network_interface.subnetwork}")

    network_policy = batch_v1.AllocationPolicy.NetworkPolicy()
    network_policy.network_interfaces = [network_interface]

    # Instance policy with custom VM image
    instance_policy = batch_v1.AllocationPolicy.InstancePolicy()
    instance_policy.machine_type = "e2-medium"  # Specify machine type for custom image
    instance_policy.boot_disk = batch_v1.AllocationPolicy.Disk()
    instance_policy.boot_disk.image = f"projects/{project_id}/global/images/galaxy-k8s-boot-v2025-08-12"
    instance_policy.boot_disk.size_gb = 99
    logger.debug(f"Using custom VM image: {instance_policy.boot_disk.image}")

    # Wrap the instance policy in InstancePolicyOrTemplate
    instance_policy_or_template = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
    instance_policy_or_template.policy = instance_policy

    allocation_policy = batch_v1.AllocationPolicy()
    allocation_policy.network = network_policy
    allocation_policy.instances = [instance_policy_or_template]

    job = batch_v1.Job()
    job.task_groups = [task_group]
    job.allocation_policy = allocation_policy
    job.logs_policy = batch_v1.LogsPolicy()
    job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING
    logger.info("Job specification built successfully")

    create_request = batch_v1.CreateJobRequest()
    create_request.parent = f"projects/{project_id}/locations/{args.region}"
    create_request.job_id = job_name
    create_request.job = job
    logger.debug(f"Create request parent: {create_request.parent}")
    logger.debug(f"Create request job_id: {create_request.job_id}")

    logger.info(f"Submitting job with name: {job_name}")
    logger.info(f"Target project: {project_id}")
    logger.info(f"Target Batch region: {args.region}")
    logger.info(f"Test target: {target_host}:{target_port}")

    # Proceed with job submission
    try:
        logger.info("Calling client.create_job()...")
        job_response = client.create_job(request=create_request)
        logger.info("Job submitted successfully!")
        logger.info(f"Job name: {job_response.name}")
        logger.info(f"Job UID: {job_response.uid}")

        with open(args.output, 'w') as f:
            f.write("Job submitted successfully using Python client.\n")
            f.write(f"Job name: {job_name}\n")
            f.write(f"Job response name: {job_response.name}\n")
            f.write(f"Job UID: {job_response.uid}\n")
            f.write(f"Project: {project_id}\n")
            f.write(f"Region: {args.region}\n")
            f.write(f"NFS Target: {target_host}:{target_port}\n")
            f.write(f"\nTo view job logs, run:\n")
            f.write(f"gcloud logging read 'resource.type=gce_instance AND resource.labels.instance_id={job_name}' --project={project_id}\n")

    except Exception as e:
        logger.error(f"Error submitting job: {type(e).__name__}: {e}")
        logger.error(f"Error details: {str(e)}")
        import traceback
        logger.error("Traceback:", exc_info=True)

        with open(args.output, 'w') as f:
            f.write(f"Error submitting job: {type(e).__name__}: {e}\n")
            f.write(f"Error details: {str(e)}\n")
            f.write(f"Job name: {job_name}\n")
            f.write(f"Project: {project_id}\n")
            f.write(f"Region: {args.region}\n")
            f.write(f"NFS Target: {target_host}:{target_port}\n")
            f.write(f"Traceback:\n")
            f.write(traceback.format_exc())

if __name__ == '__main__':
    main()