#!/bin/bash

# This script provisions a multi-zone OpenShift cluster on IBM Cloud VPC
# with standard, persistent, and optional GPU worker pools.
#
# Prerequisites: ibmcloud CLI with the kubernetes-service plugin, kubectl, and jq.

# Strict mode:
#   -e           exit immediately if a command exits with a non-zero status
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if ANY stage fails, so a failing
#                'ibmcloud ... | jq ...' is no longer masked by jq succeeding
set -euo pipefail
# Print commands and their arguments as they are executed.
set -x

# --- CONFIGURATION ---
# Every setting may be overridden from the environment, for example:
#   CLUSTER_NAME=mycluster CREATE_L4_PIPELINES_POOL=false ./<this script>
# The value after ':-' is the default used when the variable is unset or empty.

# A unique name for your cluster.
CLUSTER_NAME="${CLUSTER_NAME:-samplecluster}"

# The OpenShift version.
OCP_VERSION="${OCP_VERSION:-4.17.40_openshift}"

# The ID of your VPC.
VPC_ID="${VPC_ID:-r006-cac4bfbe-d04d-481a-a099-ba243ea64afd}"

# The CRN of your Cloud Object Storage instance.
COS_INSTANCE_CRN="${COS_INSTANCE_CRN:-crn:v1:bluemix:public:cloud-object-storage:global:a/a3fe2c01c5c843e3b79feac65f02afab:d6365d00-32d8-4d65-b208-5d106c9313a5::}"

# --- Worker Node Flavors ---
# Flavor for the initial default pool.
DEFAULT_POOL_FLAVOR="${DEFAULT_POOL_FLAVOR:-bx3d.4x20}"
# Flavor for the general, persistent, and standard pipelines pools.
STANDARD_POOL_FLAVOR="${STANDARD_POOL_FLAVOR:-cx2.8x16}"
# Flavor for the optional L4 GPU pipelines pool.
L4_GPU_FLAVOR="${L4_GPU_FLAVOR:-gx3.16x80.l4}"

# --- Multi-Zone Configuration (3 Zones) ---
# Zone 1
ZONE_1="${ZONE_1:-us-south-1}"
SUBNET_ID_1="${SUBNET_ID_1:-0717-6f46918e-2107-48ae-b023-eb053601697b}"

# Zone 2
ZONE_2="${ZONE_2:-us-south-2}"
SUBNET_ID_2="${SUBNET_ID_2:-0727-63b76f23-132d-42dd-af5c-8d5534aaebea}"

# Zone 3
ZONE_3="${ZONE_3:-us-south-3}"
SUBNET_ID_3="${SUBNET_ID_3:-0737-2ce628b5-5cb0-41cc-9e88-4f6fb3a88151}"

# --- Optional L4 GPU Pool ---
# Set to 'true' to create the pipelines-l4 pool, or 'false' to skip it.
CREATE_L4_PIPELINES_POOL="${CREATE_L4_PIPELINES_POOL:-true}"

# Autoscaler bounds for the L4 GPU pool.
L4_POOL_MIN_WORKERS="${L4_POOL_MIN_WORKERS:-1}"
L4_POOL_MAX_WORKERS="${L4_POOL_MAX_WORKERS:-2}"

# Autoscaler bounds (workers *per zone*) for the standard pipelines pool.
PIPELINES_MIN_WORKERS="${PIPELINES_MIN_WORKERS:-1}"
PIPELINES_MAX_WORKERS="${PIPELINES_MAX_WORKERS:-3}"

# Checks the configuration above before any cloud resource is created, so a
# typo is reported immediately instead of after the long cluster-creation wait.
# Globals:  reads every configuration variable defined above
# Outputs:  one "ERROR: ..." line per problem on stderr
# Returns:  0 when the configuration is usable, 1 otherwise
function validateConfig() {
  local name
  local status=0
  # Non-negative integer without leading zeros, i.e. also a valid JSON number
  # (the worker bounds are later handed to 'jq --argjson').
  local int_re='^(0|[1-9][0-9]*)$'

  for name in CLUSTER_NAME OCP_VERSION VPC_ID COS_INSTANCE_CRN \
    DEFAULT_POOL_FLAVOR STANDARD_POOL_FLAVOR \
    ZONE_1 SUBNET_ID_1 ZONE_2 SUBNET_ID_2 ZONE_3 SUBNET_ID_3; do
    if [[ -z "${!name}" ]]; then
      echo "ERROR: $name must not be empty." >&2
      status=1
    fi
  done

  # Anything other than the literal 'true' would silently skip the GPU pool,
  # so reject values such as 'True' or 'yes' outright.
  case "$CREATE_L4_PIPELINES_POOL" in
    true)
      if [[ -z "$L4_GPU_FLAVOR" ]]; then
        echo "ERROR: L4_GPU_FLAVOR must not be empty when CREATE_L4_PIPELINES_POOL=true." >&2
        status=1
      fi
      ;;
    false) ;;
    *)
      echo "ERROR: CREATE_L4_PIPELINES_POOL must be 'true' or 'false' (got '$CREATE_L4_PIPELINES_POOL')." >&2
      status=1
      ;;
  esac

  for name in L4_POOL_MIN_WORKERS L4_POOL_MAX_WORKERS \
    PIPELINES_MIN_WORKERS PIPELINES_MAX_WORKERS; do
    if ! [[ "${!name}" =~ $int_re ]]; then
      echo "ERROR: $name must be a non-negative integer (got '${!name}')." >&2
      status=1
    fi
  done

  # Only compare the bounds once all of them are known to be integers.
  if (( status == 0 )); then
    if (( PIPELINES_MIN_WORKERS > PIPELINES_MAX_WORKERS )); then
      echo "ERROR: PIPELINES_MIN_WORKERS ($PIPELINES_MIN_WORKERS) exceeds PIPELINES_MAX_WORKERS ($PIPELINES_MAX_WORKERS)." >&2
      status=1
    fi
    if (( L4_POOL_MIN_WORKERS > L4_POOL_MAX_WORKERS )); then
      echo "ERROR: L4_POOL_MIN_WORKERS ($L4_POOL_MIN_WORKERS) exceeds L4_POOL_MAX_WORKERS ($L4_POOL_MAX_WORKERS)." >&2
      status=1
    fi
  fi

  return "$status"
}

validateConfig || exit 1

# --- HELPER FUNCTION ---
# Polls 'ibmcloud oc worker ls' until enough workers of a pool report the
# health message "Ready" (shown as 'normal' in the progress output).
#
# Arguments:
#   $1 - cluster name or ID
#   $2 - worker pool name
#   $3 - minimum number of ready workers to wait for
#   $4 - (optional) timeout in seconds, default 7200; 0 waits forever
#   $5 - (optional) polling interval in seconds, default 30
# Outputs:
#   progress messages on stdout, errors on stderr
# Returns:
#   0 once at least $3 workers are ready,
#   1 on invalid arguments or when the timeout expires
function waitForWorkers() {
  local CLUSTER_NAME="$1"
  local POOL_NAME="$2"
  local EXPECTED_COUNT="$3"
  local TIMEOUT_SECONDS="${4:-7200}"
  local POLL_SECONDS="${5:-30}"
  local NORMAL_COUNT
  local WAITED_SECONDS=0
  local INT_RE='^(0|[1-9][0-9]*)$'

  if ! [[ "$EXPECTED_COUNT" =~ $INT_RE && "$TIMEOUT_SECONDS" =~ $INT_RE && "$POLL_SECONDS" =~ $INT_RE ]]; then
    echo "ERROR: waitForWorkers: count, timeout and interval must be non-negative integers." >&2
    return 1
  fi
  if (( POLL_SECONDS == 0 )); then
    echo "ERROR: waitForWorkers: polling interval must be at least 1 second." >&2
    return 1
  fi

  echo "--> Waiting for $EXPECTED_COUNT worker(s) in pool '$POOL_NAME' to become 'normal'..."
  while true; do
    # A failed CLI call must neither abort the script (set -e / pipefail) nor
    # feed an empty string into the numeric comparison; treat it as
    # "no ready workers seen" and try again on the next poll.
    if ! NORMAL_COUNT=$(ibmcloud oc worker ls --cluster "$CLUSTER_NAME" --worker-pool "$POOL_NAME" --output json \
      | jq -r '[.[] | select(.health.message=="Ready")] | length'); then
      NORMAL_COUNT=""
    fi
    if ! [[ "$NORMAL_COUNT" =~ $INT_RE ]]; then
      NORMAL_COUNT=0
    fi

    if (( NORMAL_COUNT >= EXPECTED_COUNT )); then
      echo "--> Success: $NORMAL_COUNT of $EXPECTED_COUNT worker(s) are 'normal' in pool '$POOL_NAME'."
      return 0
    fi

    # Elapsed time counts only the sleep intervals, so the real wall-clock
    # limit is slightly longer than TIMEOUT_SECONDS.
    if (( TIMEOUT_SECONDS > 0 && WAITED_SECONDS >= TIMEOUT_SECONDS )); then
      echo "ERROR: Timed out after ${WAITED_SECONDS}s: only $NORMAL_COUNT of $EXPECTED_COUNT worker(s) are 'normal' in pool '$POOL_NAME'." >&2
      return 1
    fi

    echo "--> $NORMAL_COUNT of $EXPECTED_COUNT workers are 'normal'. Checking again in $POLL_SECONDS seconds..."
    sleep "$POLL_SECONDS"
    WAITED_SECONDS=$(( WAITED_SECONDS + POLL_SECONDS ))
  done
}

echo "Step 1: Starting OpenShift cluster creation for $CLUSTER_NAME..."

# The cluster starts with a two-worker default pool in zone 1; the arguments
# are collected in an array so each flag/value pair sits on its own line.
CLUSTER_CREATE_ARGS=(
  --name "$CLUSTER_NAME"
  --version "$OCP_VERSION"
  --vpc-id "$VPC_ID"
  --subnet-id "$SUBNET_ID_1"
  --flavor "$DEFAULT_POOL_FLAVOR"
  --workers 2
  --zone "$ZONE_1"
  --cos-instance "$COS_INSTANCE_CRN"
)
ibmcloud oc cluster create vpc-gen2 "${CLUSTER_CREATE_ARGS[@]}"

echo "Step 2: Waiting for cluster to be in 'normal' state. This will take 40+ minutes..."

# Poll the cluster state once per iteration so the state that is tested is the
# same one that is reported (the CLI is not queried a second time for the log).
while true; do
  # '|| CLUSTER_STATE=""' keeps a transient CLI failure from aborting the
  # script under 'set -e'; the loop simply polls again.
  CLUSTER_STATE=$(ibmcloud oc cluster get --cluster "$CLUSTER_NAME" --output json | jq -r '.state') || CLUSTER_STATE=""

  case "$CLUSTER_STATE" in
    normal)
      break
      ;;
    # NOTE(review): state names taken from the IBM Cloud cluster-state list;
    # confirm the exact spellings against the CLI's JSON output.
    deploy_failed | aborted | deleting | deleted | delete_failed)
      echo "ERROR: Cluster $CLUSTER_NAME entered state '$CLUSTER_STATE'; it will not become 'normal'." >&2
      exit 1
      ;;
  esac

  echo "Cluster is not ready yet. Current state: ${CLUSTER_STATE:-unknown}. Checking again in 60 seconds."
  sleep 60
done

echo "Cluster is ready. Proceeding with worker pool configuration."

# --- Create General Worker Pool ---
# One worker per zone, labelled for general-purpose workloads.
echo "Step 3: Creating 'general' worker pool with flavor $STANDARD_POOL_FLAVOR..."
ibmcloud oc worker-pool create vpc-gen2 \
  --cluster "$CLUSTER_NAME" \
  --name general \
  --flavor "$STANDARD_POOL_FLAVOR" \
  --size-per-zone 1 \
  --label wallaroo.ai/node-purpose=general

echo "Adding zones to 'general' pool..."
# ZONE_<n> / SUBNET_ID_<n> are looked up by name through indirect expansion.
for zone_index in 1 2 3; do
  zone_var="ZONE_${zone_index}"
  subnet_var="SUBNET_ID_${zone_index}"
  ibmcloud oc zone add vpc-gen2 \
    --zone "${!zone_var}" \
    --subnet-id "${!subnet_var}" \
    --cluster "$CLUSTER_NAME" \
    --worker-pool general
done


# --- Create Persistent Worker Pool ---
# Single-zone pool (zone 1 only). The pool's NoSchedule taint is not set here;
# it is applied with kubectl once the workers are up.
echo "Step 4: Creating 'persistent' worker pool..."
ibmcloud oc worker-pool create vpc-gen2 \
  --cluster "$CLUSTER_NAME" \
  --name persistent \
  --flavor "$STANDARD_POOL_FLAVOR" \
  --size-per-zone 1 \
  --label wallaroo.ai/node-purpose=persistent
ibmcloud oc zone add vpc-gen2 \
  --zone "$ZONE_1" \
  --subnet-id "$SUBNET_ID_1" \
  --cluster "$CLUSTER_NAME" \
  --worker-pool persistent

# --- Create Standard Pipelines Worker Pool ---
# One worker per zone across all three zones. The pool's NoSchedule taint is
# not set here; it is applied with kubectl once the workers are up.
echo "Step 5: Creating 'pipelines' worker pool..."
ibmcloud oc worker-pool create vpc-gen2 \
  --cluster "$CLUSTER_NAME" \
  --name pipelines \
  --flavor "$STANDARD_POOL_FLAVOR" \
  --size-per-zone 1 \
  --label wallaroo.ai/node-purpose=pipelines

# ZONE_<n> / SUBNET_ID_<n> are looked up by name through indirect expansion.
for zone_index in 1 2 3; do
  zone_var="ZONE_${zone_index}"
  subnet_var="SUBNET_ID_${zone_index}"
  ibmcloud oc zone add vpc-gen2 \
    --zone "${!zone_var}" \
    --subnet-id "${!subnet_var}" \
    --cluster "$CLUSTER_NAME" \
    --worker-pool pipelines
done

# --- Create Optional L4 GPU Pipelines Worker Pool ---
# Only created when CREATE_L4_PIPELINES_POOL is exactly 'true'. The pool lives
# in zone 1 only; its taints are applied with kubectl once the workers are up.
if [[ "$CREATE_L4_PIPELINES_POOL" == "true" ]]; then
  echo "Step 6: Creating optional 'pipelines-l4' GPU worker pool..."
  L4_POOL_CREATE_ARGS=(
    --cluster "$CLUSTER_NAME"
    --name pipelines-l4
    --flavor "$L4_GPU_FLAVOR"
    --size-per-zone 1
    --label wallaroo.ai/node-purpose=pipelines
    --label wallaroo.ai/accelerator=l4
  )
  ibmcloud oc worker-pool create vpc-gen2 "${L4_POOL_CREATE_ARGS[@]}"
  ibmcloud oc zone add vpc-gen2 \
    --zone "$ZONE_1" \
    --subnet-id "$SUBNET_ID_1" \
    --cluster "$CLUSTER_NAME" \
    --worker-pool pipelines-l4
fi

# --- Wait for all worker nodes to finish provisioning ---
# Steps 3-6 are the pool-creation steps, so the remaining steps continue at 7.
echo "Step 7: Waiting for all worker pools to finish provisioning..."
# Expected counts: 1 worker per zone x number of zones attached to each pool.
waitForWorkers "$CLUSTER_NAME" "general" 3
waitForWorkers "$CLUSTER_NAME" "persistent" 1
waitForWorkers "$CLUSTER_NAME" "pipelines" 3
if [ "$CREATE_L4_PIPELINES_POOL" = true ]; then
  waitForWorkers "$CLUSTER_NAME" "pipelines-l4" 1
fi

# Fetch the admin kubeconfig so the kubectl commands below target this cluster.
ibmcloud oc cluster config --cluster "$CLUSTER_NAME" --admin

# --- Apply Taints using kubectl ---
# Nodes are selected by the worker-pool-name label; '--overwrite' makes the
# taint commands safe to re-run.
echo "Step 8: Applying taints to worker pools..."
echo "Waiting for nodes in 'persistent' pool to be ready before tainting..."
kubectl wait --for=condition=Ready node -l ibm-cloud.kubernetes.io/worker-pool-name=persistent --timeout=10m

echo "Tainting 'persistent' pool nodes..."
kubectl taint nodes -l ibm-cloud.kubernetes.io/worker-pool-name=persistent wallaroo.ai/persistent=true:NoSchedule --overwrite

echo "Waiting for nodes in 'pipelines' pool to be ready before tainting..."
kubectl wait --for=condition=Ready node -l ibm-cloud.kubernetes.io/worker-pool-name=pipelines --timeout=10m

echo "Tainting 'pipelines' pool nodes..."
kubectl taint nodes -l ibm-cloud.kubernetes.io/worker-pool-name=pipelines wallaroo.ai/pipelines=true:NoSchedule --overwrite

if [ "$CREATE_L4_PIPELINES_POOL" = true ]; then
  echo "Waiting for nodes in 'pipelines-l4' pool to be ready before tainting..."
  kubectl wait --for=condition=Ready node -l ibm-cloud.kubernetes.io/worker-pool-name=pipelines-l4 --timeout=15m

  echo "Tainting 'pipelines-l4' pool nodes..."
  # GPU nodes get both the pipelines taint and a GPU-specific taint.
  kubectl taint nodes -l ibm-cloud.kubernetes.io/worker-pool-name=pipelines-l4 wallaroo.ai/pipelines=true:NoSchedule --overwrite
  kubectl taint nodes -l ibm-cloud.kubernetes.io/worker-pool-name=pipelines-l4 nvidia.com/gpu=l4:NoSchedule --overwrite
fi

# --- Configure Autoscaling ---
echo "Step 9: Enabling and configuring cluster autoscaler..."
ibmcloud oc cluster addon enable cluster-autoscaler --cluster "$CLUSTER_NAME"
echo "Waiting for autoscaler addon to become active..."
while [[ $(ibmcloud oc cluster addon ls --cluster "$CLUSTER_NAME" --output json | jq -r '.[] | select(.name=="cluster-autoscaler") | .healthState') != "normal" ]]; do
  echo "Autoscaler addon is not active yet. Checking again in 20 seconds."
  sleep 20
done

echo "Building autoscaler configuration using jq..."
# Only 'pipelines' (and 'pipelines-l4' when enabled) are autoscaled; 'general'
# and 'persistent' are listed with autoscaling disabled.
JSON_CONFIG=$(jq -n \
  --argjson pipelines_min "$PIPELINES_MIN_WORKERS" \
  --argjson pipelines_max "$PIPELINES_MAX_WORKERS" \
  --arg create_l4 "$CREATE_L4_PIPELINES_POOL" \
  --argjson l4_min "$L4_POOL_MIN_WORKERS" \
  --argjson l4_max "$L4_POOL_MAX_WORKERS" \
  '
  # Start with the static pools
  [
      {"name":"general", "minSize":1, "maxSize":1, "enabled":false},
      {"name":"persistent", "minSize":1, "maxSize":1, "enabled":false},
      {"name":"pipelines", "minSize":$pipelines_min, "maxSize":$pipelines_max, "enabled":true}
  ]
  # Conditionally add the L4 pool
  | if $create_l4 == "true" then
      . + [{"name":"pipelines-l4", "minSize":$l4_min, "maxSize":$l4_max, "enabled":true}]
    else
      .
    end
  '
)

# The add-on reads its worker-pool settings from the 'workerPoolsConfig.json'
# key of the 'iks-ca-configmap' ConfigMap in kube-system. Without this patch
# the configuration built above would never reach the cluster.
echo "Waiting for the autoscaler ConfigMap to exist..."
CONFIGMAP_ATTEMPTS=0
until kubectl get configmap iks-ca-configmap --namespace kube-system >/dev/null 2>&1; do
  CONFIGMAP_ATTEMPTS=$(( CONFIGMAP_ATTEMPTS + 1 ))
  if (( CONFIGMAP_ATTEMPTS >= 30 )); then
    echo "ERROR: ConfigMap 'iks-ca-configmap' did not appear in namespace kube-system." >&2
    exit 1
  fi
  sleep 10
done

echo "Applying autoscaler configuration..."
# jq builds the merge patch so the JSON document is embedded as a correctly
# escaped string value.
AUTOSCALER_PATCH=$(jq -n --arg pools "$JSON_CONFIG" '{"data": {"workerPoolsConfig.json": $pools}}')
kubectl patch configmap iks-ca-configmap --namespace kube-system --type merge --patch "$AUTOSCALER_PATCH"


# --- Final Cleanup ---
echo "Step 10: Removing the temporary 'default' worker pool..."
# Wait until the two initial workers are up before removing their pool.
waitForWorkers "$CLUSTER_NAME" "default" 2
ibmcloud oc worker-pool rm --cluster "$CLUSTER_NAME" --worker-pool default -f

# Closing banner, including the command that fetches the cluster's kubeconfig.
printf '%s\n' \
  "---" \
  "✅ SUCCESS: Cluster $CLUSTER_NAME is provisioned and configured." \
  "Run the following command to connect to your cluster:" \
  "ibmcloud oc cluster config --cluster $CLUSTER_NAME --admin" \
  "---"
