diff gcp_batch_netcat.py @ 5:b2ce158b4f22 draft

planemo upload commit ece227052d14d755b0d0b07a827152b2e98fb94b
author enis
date Thu, 24 Jul 2025 21:41:18 +0000
parents 2ff4a39ea41b
children d25792770df8
line wrap: on
line diff
--- a/gcp_batch_netcat.py	Tue Jul 22 14:47:47 2025 +0000
+++ b/gcp_batch_netcat.py	Thu Jul 24 21:41:18 2025 +0000
@@ -3,12 +3,10 @@
 import logging
 import os
 import sys
-# import time
 import uuid
 from google.cloud import batch_v1
 
 # Configure logging to go to stdout instead of stderr to avoid Galaxy marking job as failed
-import sys
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
@@ -16,6 +14,114 @@
 )
 logger = logging.getLogger(__name__)
 
+def determine_test_target(args):
+    """Determine the target host and port based on test type"""
+
+    if args.test_type == 'custom':
+        if not args.custom_host:
+            raise ValueError("custom_host is required when test_type is 'custom'")
+        return args.custom_host, args.custom_port
+
+    elif args.test_type == 'nfs':
+        # Extract NFS server address if not provided
+        if args.nfs_address:
+            nfs_address = args.nfs_address
+            logger.info(f"Using provided NFS address: {nfs_address}")
+        else:
+            try:
+                # Try to detect NFS server from /galaxy/server/database/ mount
+                import subprocess
+                result = subprocess.run(['mount'], capture_output=True, text=True)
+                nfs_address = None
+
+                for line in result.stdout.split('\n'):
+                    if '/galaxy/server/database' in line and ':' in line:
+                        # Look for NFS mount pattern: server:/path on /galaxy/server/database
+                        parts = line.split()
+                        for part in parts:
+                            if ':' in part and part.count(':') == 1:
+                                nfs_address = part.split(':')[0]
+                                break
+                        if nfs_address:
+                            logger.info(f"Detected NFS address from mount: {nfs_address}")
+                            break
+
+                if not nfs_address:
+                    # Fallback: try to parse /proc/mounts
+                    try:
+                        with open('/proc/mounts', 'r') as f:
+                            for line in f:
+                                if '/galaxy/server/database' in line and ':' in line:
+                                    parts = line.split()
+                                    if len(parts) > 0 and ':' in parts[0]:
+                                        nfs_address = parts[0].split(':')[0]
+                                        logger.info(f"Detected NFS address from /proc/mounts: {nfs_address}")
+                                        break
+                    except:
+                        pass
+
+                if not nfs_address:
+                    raise ValueError("Could not auto-detect NFS server address from /galaxy/server/database/ mount")
+
+                logger.info(f"Auto-detected NFS address from mount: {nfs_address}")
+            except Exception as e:
+                logger.error(f"Failed to auto-detect NFS address: {e}")
+                raise
+        return nfs_address, 2049
+
+    elif args.test_type == 'galaxy_web':
+        # Try to detect Galaxy web service
+        try:
+            import subprocess
+            result = subprocess.run(['kubectl', 'get', 'svc', '-o', 'json'], capture_output=True, text=True)
+            if result.returncode == 0:
+                services = json.loads(result.stdout)
+                for item in services.get('items', []):
+                    name = item.get('metadata', {}).get('name', '')
+                    if 'galaxy' in name.lower() and ('web' in name.lower() or 'nginx' in name.lower()):
+                        # Found a Galaxy web service
+                        spec = item.get('spec', {})
+                        if spec.get('type') == 'LoadBalancer':
+                            ingress = item.get('status', {}).get('loadBalancer', {}).get('ingress', [])
+                            if ingress:
+                                ip = ingress[0].get('ip')
+                                if ip:
+                                    port = 80
+                                    for port_spec in spec.get('ports', []):
+                                        if port_spec.get('port'):
+                                            port = port_spec['port']
+                                            break
+                                    logger.info(f"Found Galaxy web service LoadBalancer: {ip}:{port}")
+                                    return ip, port
+                        # Fallback to ClusterIP
+                        cluster_ip = spec.get('clusterIP')
+                        if cluster_ip and cluster_ip != 'None':
+                            port = 80
+                            for port_spec in spec.get('ports', []):
+                                if port_spec.get('port'):
+                                    port = port_spec['port']
+                                    break
+                            logger.info(f"Found Galaxy web service ClusterIP: {cluster_ip}:{port}")
+                            return cluster_ip, port
+        except Exception as e:
+            logger.warning(f"Could not auto-detect Galaxy web service: {e}")
+
+        # Fallback: try common Galaxy service names
+        common_hosts = ['galaxy-web', 'galaxy-nginx', 'galaxy']
+        logger.info(f"Trying common Galaxy service name: {common_hosts[0]}")
+        return common_hosts[0], 80
+
+    elif args.test_type == 'k8s_dns':
+        # Test Kubernetes DNS resolution
+        return 'kubernetes.default.svc.cluster.local', 443
+
+    elif args.test_type == 'google_dns':
+        # Test external connectivity
+        return '8.8.8.8', 53
+
+    else:
+        raise ValueError(f"Unsupported test type: {args.test_type}")
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--nfs_address', required=False, help='NFS server address (if not provided, will be auto-detected from /galaxy/server/database/ mount)')
@@ -25,6 +131,10 @@
     parser.add_argument('--network', default='default', help='GCP Network name')
     parser.add_argument('--subnet', default='default', help='GCP Subnet name')
     parser.add_argument('--service_account_key', required=True)
+    parser.add_argument('--test_type', default='nfs', choices=['nfs', 'galaxy_web', 'k8s_dns', 'google_dns', 'custom'],
+                       help='Type of connectivity test to perform')
+    parser.add_argument('--custom_host', required=False, help='Custom host to test (required if test_type is custom)')
+    parser.add_argument('--custom_port', type=int, default=80, help='Custom port to test (default: 80)')
     args = parser.parse_args()
 
     # Set up authentication using the service account key
@@ -47,52 +157,13 @@
             logger.error(f"Failed to extract project ID from service account key: {e}")
             raise
 
-    # Extract NFS server address if not provided
-    if args.nfs_address:
-        nfs_address = args.nfs_address
-        logger.info(f"Using provided NFS address: {nfs_address}")
-    else:
-        try:
-            # Try to detect NFS server from /galaxy/server/database/ mount
-            import subprocess
-            result = subprocess.run(['mount'], capture_output=True, text=True)
-            nfs_address = None
-
-            for line in result.stdout.split('\n'):
-                if '/galaxy/server/database' in line and ':' in line:
-                    # Look for NFS mount pattern: server:/path on /galaxy/server/database
-                    parts = line.split()
-                    for part in parts:
-                        if ':' in part and part.count(':') == 1:
-                            nfs_address = part.split(':')[0]
-                            break
-                    if nfs_address:
-                        logger.info(f"Detected NFS address from mount: {nfs_address}")
-                        break
-
-            if not nfs_address:
-                # Fallback: try to parse /proc/mounts
-                try:
-                    with open('/proc/mounts', 'r') as f:
-                        for line in f:
-                            if '/galaxy/server/database' in line and ':' in line:
-                                parts = line.split()
-                                if len(parts) > 0 and ':' in parts[0]:
-                                    nfs_address = parts[0].split(':')[0]
-                                    logger.info(f"Detected NFS address from /proc/mounts: {nfs_address}")
-                                    break
-                except:
-                    pass
-
-            if not nfs_address:
-                raise ValueError("Could not auto-detect NFS server address from /galaxy/server/database/ mount")
-
-            logger.info(f"Auto-detected NFS address from mount: {nfs_address}")
-        except Exception as e:
-            logger.error(f"Failed to auto-detect NFS address: {e}")
-            raise
-
-    # time.sleep(10000)
+    # Determine target host and port based on test type
+    try:
+        target_host, target_port = determine_test_target(args)
+        logger.info(f"Target determined: {target_host}:{target_port}")
+    except Exception as e:
+        logger.error(f"Failed to determine target: {e}")
+        raise
 
     job_name = f'netcat-job-{uuid.uuid4()}'
     logger.info(f"Generated job name: {job_name}")
@@ -107,9 +178,84 @@
     runnable = batch_v1.Runnable()
     runnable.container = batch_v1.Runnable.Container()
     runnable.container.image_uri = "afgane/gcp-batch-netcat:0.2.0"
-    runnable.container.entrypoint = "/usr/bin/nc"
-    runnable.container.commands = ["-z", "-v", nfs_address, "2049"]
-    logger.debug(f"Container config: image={runnable.container.image_uri}, entrypoint={runnable.container.entrypoint}, commands={runnable.container.commands}")
+
+    # Create a comprehensive test script
+    test_script = f'''#!/bin/bash
+set -e
+echo "=== GCP Batch Connectivity Test ==="
+echo "Test Type: {args.test_type}"
+echo "Target: {target_host}:{target_port}"
+echo "Timestamp: $(date)"
+echo "Container hostname: $(hostname)"
+echo ""
+
+# Basic network info
+echo "=== Network Information ==="
+echo "Container IP addresses:"
+hostname -I
+echo "Default route:"
+ip route | grep default || echo "No default route found"
+echo ""
+
+# DNS configuration
+echo "=== DNS Configuration ==="
+echo "DNS servers:"
+cat /etc/resolv.conf | grep nameserver || echo "No nameservers found"
+echo ""
+
+# Test DNS resolution of target
+echo "=== DNS Resolution Test ==="
+echo "Resolving {target_host}:"
+nslookup {target_host} || {{
+    echo "DNS resolution failed for {target_host}"
+    echo "Trying with Google DNS (8.8.8.8):"
+    nslookup {target_host} 8.8.8.8 || echo "DNS resolution failed even with Google DNS"
+}}
+echo ""
+
+# Basic connectivity test
+echo "=== Primary Connectivity Test ==="
+echo "Testing connection to {target_host}:{target_port}..."
+timeout 30 nc -z -v -w 10 {target_host} {target_port}
+nc_result=$?
+echo "Netcat result: $nc_result"
+echo ""
+
+# Additional connectivity tests
+echo "=== Additional Connectivity Tests ==="
+echo "Testing Google DNS (8.8.8.8:53):"
+timeout 10 nc -z -v -w 5 8.8.8.8 53 && echo "✓ External DNS reachable" || echo "✗ External DNS unreachable"
+
+echo "Testing Kubernetes API (if accessible):"
+timeout 10 nc -z -v -w 5 kubernetes.default.svc.cluster.local 443 2>/dev/null && echo "✓ Kubernetes API reachable" || echo "✗ Kubernetes API unreachable"
+
+echo ""
+echo "=== Network Troubleshooting ==="
+echo "Route table:"
+ip route
+echo ""
+echo "ARP table:"
+arp -a 2>/dev/null || echo "ARP command not available"
+echo ""
+
+echo "=== Final Result ==="
+if [ $nc_result -eq 0 ]; then
+    echo "✓ SUCCESS: Connection to {target_host}:{target_port} successful"
+    exit 0
+else
+    echo "✗ FAILED: Connection to {target_host}:{target_port} failed"
+    echo "This suggests a network connectivity issue between GCP Batch and the target service."
+    echo "Common causes:"
+    echo "- Firewall rules blocking traffic"
+    echo "- Service not accessible from external networks"
+    echo "- Target service only accepting internal cluster traffic"
+    exit 1
+fi
+'''
+
+    runnable.container.entrypoint = "/bin/bash"
+    runnable.container.commands = ["-c", test_script]
+    logger.debug(f"Container config: image={runnable.container.image_uri}, entrypoint={runnable.container.entrypoint}")
 
     task = batch_v1.TaskSpec()
     task.runnables = [runnable]
@@ -154,7 +300,7 @@
     logger.info(f"Submitting job with name: {job_name}")
     logger.info(f"Target project: {project_id}")
     logger.info(f"Target Batch region: {args.region}")
-    logger.info(f"NFS target: {nfs_address}:2049")
+    logger.info(f"Test target: {target_host}:{target_port}")
 
     # Proceed with job submission
     try:
@@ -171,7 +317,10 @@
             f.write(f"Job UID: {job_response.uid}\n")
             f.write(f"Project: {project_id}\n")
             f.write(f"Region: {args.region}\n")
-            f.write(f"NFS Address: {nfs_address}:2049\n")
+            f.write(f"Test Type: {args.test_type}\n")
+            f.write(f"Target: {target_host}:{target_port}\n")
+            f.write(f"\nTo view job logs, run:\n")
+            f.write(f"gcloud logging read 'resource.type=gce_instance AND resource.labels.instance_id={job_name}' --project={project_id}\n")
 
     except Exception as e:
         logger.error(f"Error submitting job: {type(e).__name__}: {e}")
@@ -185,6 +334,8 @@
             f.write(f"Job name: {job_name}\n")
             f.write(f"Project: {project_id}\n")
             f.write(f"Region: {args.region}\n")
+            f.write(f"Test Type: {args.test_type}\n")
+            f.write(f"Target: {target_host}:{target_port}\n")
             f.write(f"Traceback:\n")
             f.write(traceback.format_exc())