Determined HCVs

Values

determined:
  enabled: false
  detVersion: "0.35.0"
  # The image registry to be used to pull the Master image.
  # Determined OSS edition uses the determinedai repository in DockerHub.
  imageRegistry: determinedai
  # HPE Machine Learning Development Environment (MLDE), Determined Enterprise Edition, uses the HPE MSC as the image registry
  #imageRegistry: hub.myenterpriselicense.hpe.com/hpe-mlde/<SKU>
  # ATTENTION
  # Please also set:
  #   - communicated product SKU,
  #   - enterpriseEdition flag to true,
  # and configure the imagePullSecretName to the HPE MSC credentials K8s Secret (e.g. mlde-hpe-registry)
  #
  # To get the HPE MSC credentials go to the myenterpriselicense.hpe.com website, and along with the information provided with your order
  # create the HPE MSC credentials K8s Secret (e.g. mlde-hpe-registry) using the following command:
  # kubectl create secret docker-registry mlde-hpe-registry  \
  # --docker-server=hub.myenterpriselicense.hpe.com/hpe-mlde/<SKU> \
  # --docker-username=<HPE MSC user name>  \
  # --docker-password=<HPE MSC MLDE license key> \
  # --docker-email=<HPE MSC user email> \
  # -n <MLDE deployment K8s namespace, if any>

  # Default images used during the deployment
  defaultImages:
    # PostgreSQL image
    postgreSQL: "postgres:10.14"

    # default Kube Scheduler image
    kubeScheduler: "k8s.gcr.io/scheduler-plugins/kube-scheduler:v0.18.9"

    # Kube Scheduler used when the K8s default scheduler is set to preemption
    # when, defaultScheduler: preemption
    kubeSchedulerPreemption: "determinedai/kube-scheduler:0.17.0"

    # default images for CPU and GPU environments

    cpuImage: "determinedai/pytorch-ngc:0.35.0"
    gpuImage: "determinedai/pytorch-ngc:0.35.0"

  # Install Determined enterprise edition.
  enterpriseEdition: false

  # Should be configured if using the master image in the Determined enterprise edition
  # or private registry.
  imagePullSecretName: ""

  # Logger Level in master.yaml - Four severity levels: debug, info, warn, error
  logLevel: info
  # Sets in master.yaml the output of Logger in color mode - Values: true (default), false
  logColor: true

  # masterPort configures the port at which the Determined master listens for connections on.
  masterPort: 8080

  # Enables the creation of non-namespaced objects - Default: true
  # Non-namespaced object are cluster-wide resources, such as the PriorityClasses.
  # In multiple installation on a single cluster (using different namespaces),
  # this flag set to false avoids to recreate non-namespaced objects. In some cases (e.g., GitOps w/ArgoCD)
  # creating existing cluster-wide resources could stop/hang automatic deployments.
  #
  # WARNING
  # The first installation must run with the createNonNamespacedObjects flag set to true to ensure
  # the non-namespaced objects are created.
  # When deploying multiple namespaces within the same shared cluster, this should be set to false
  # with helm overrides.
  createNonNamespacedObjects: true

  # Integration with Pachyderm. Change the pachyderm address only if determined and pachyderm are deployed in different namespaces.
  integrations:
    pachyderm:
      address: "" # Defaults to grpc://pachd.<releaseNamespace>.svc.cluster.local:30650

  # External ca.crt injection certificate/s secret name
  # Command to create the ca cert secret:
  #     kubectl create secret generic <external ca cert secret name, e.g., ext-ca-cert> --from-file=<ca.crt or ca bundle filename> -n <namespace>
  #
  # externalCaCertSecretName: <external ca cert secret name, e.g., ext-ca-cert>

  # When useNodePortForMaster is set to false (default), a LoadBalancer service is deployed to make
  # the Determined master reachable from outside the cluster. When useNodePortForMaster is set to
  # true, the master will instead be exposed behind a NodePort service. When using a NodePort service
  # users will typically have to configure an Ingress to make the Determined master reachable from
  # outside the cluster. NodePort service is recommended when configuring TLS termination in a
  # load-balancer.
  useNodePortForMaster: false

  # Enable route support for Openshift by setting enabled to true. Configure tls termination (i.e edge) if needed.
  # openshiftRoute:
  # enabled:
  # host:
  # termination:

  # tlsSecret enables TLS encryption for all communication made to the Determined master (TLS
  # termination is performed in the Determined master). This includes communication between the
  # Determined master and the task containers it launches, but does not include communication between
  # the task containers (distributed training). The specified Secret of type tls must already exist in
  # the same namespace in which Determined is being installed.
  # tlsSecret:

  # security:
  # defaultTask sets the user and group that tasks will run as. For convenience, the default Determined
  # environments contain an unprivileged user named det-nobody, which does have a writable HOME
  # directory. The det-nobody user is a suitable default user when using the default Determined
  # environment images and when running containers as root is not desired.
  # defaultTask:
  #   user: det-nobody
  #   uid: 65533
  #   group: det-nobody
  #   gid: 65533
  # authz option (EE-only) sets the authorization mode.
  # authz:
  #   type: rbac

  # oidc (EE-only) enables OpenID Connect Integration, which is only available if enterpriseEdition
  # is true. It allows users to use single sign-on with their organization’s identity provider.
  # clientSecretKey is the key of the secret contained in the secret.
  oidc:
    enabled: false
    provider: ""
    idpRecipientUrl: ""
    idpSsoUrl: ""
    clientId: ""
    clientSecretKey: ""
    clientSecretName: ""
    authenticationClaim: ""
    scimAuthenticationAttribute: ""
    autoProvisionUsers: false
    groupsAttributeName: ""
    displayNameAttributeName: ""
    alwaysRedirect: false

  # scim (EE-only) enables System for Cross-domain Identity Management (SCIM) integration, which is
  # only available if enterpriseEdition is true. It allows administrators to easily and securely
  # provision users and groups through their standard identity provider (IdP).
  # scim:
  #   enabled: true
  #   auth:
  #     type: basic
  #     username: determined
  #     password: password

  # db sets the configurations for the database.
  db:
    # To deploy your own Postgres DB, provide a hostAddress. If hostAddress is provided, Determined
    # will skip deploying a Postgres DB.
    # hostAddress:

    # Required parameters, whether you are using your own DB or a Determined DB.
    name: determined
    user: postgres
    password: postgres
    port: 5432

    # Only used for Determined DB deployment. Configures the size of the PersistentVolumeClaim for the
    # Determined deployed database, as well as the CPU and memory requirements. Should be adjusted for
    # scale.
    storageSize: 30Gi
    cpuRequest: "2"
    memRequest: 8Gi
    #  cpuLimit: 2
    #  memLimit: 8Gi

    # useNodePortForDB configures whether ClusterIP or NodePort service type is used for the
    # Determined deployed DB. By default ClusterIP is used.
    useNodePortForDB: false

    # storageClassName configures the StorageClass used by the PersistentVolumeClaim for the
    # Determined deployed database. This can be left blank if a default storage class is specified in
    # the cluster. If dynamic provisioning of PersistentVolumes is disabled, users must manually
    # create a PersistentVolume that will match the PersistentVolumeClaim.
    # storageClassName:

    # ssl_mode and ssl_root_cert configure the TLS connection to the database. Users must first
    # create a kubernetes secret or configMap containing their certificate and specify its name in
    # certResourceName. For sslRootCert, specify the name of the file only (not path).
    # sslMode: verify-ca
    # sslRootCert: <cert_name>
    # resourceType: <secret/configMap>
    # certResourceName: <secret/configMap name>

  # checkpointStorage controls where checkpoints are stored. Supported types include `shared_fs`,
  # `gcs`, and `s3`.
  checkpointStorage:
    # Applicable to all checkpointStorage types.
    saveExperimentBest: 0
    saveTrialBest: 1
    saveTrialLatest: 1

    # Comment out if not using `shared_fs`. Users are strongly discouraged from using `shared_fs` for
    # storage beyond initial testing as most Kubernetes cluster nodes do not have a shared file
    # system.
    type: shared_fs
    hostPath: /tmp/checkpoints
    # By default, shared_fs is not mounted to the server pod. Change this to true to enable checkpoint downloads from the server.
    mountToServer: false

    # For storing in GCS.
    # type: gcs
    # bucket: <bucket_name>
    # prefix: <prefix>

    # For storing in S3.
    # type: s3
    # bucket: <bucket_name>
    # accessKey: <access_key>
    # secretKey: <secret_key>
    # endpointUrl: <endpoint_url>
    # prefix: <prefix>

    # For storing in Azure Blob Storage with a connection string.
    # Do NOT use if already using Azure Blob Storage with account URL
    # type: azure
    # container: <container_name>
    # connection_string: <connection_string>

    # For storing in Azure Blob Storage with an account URL.
    # Do NOT use if already using Azure Blob Storage with connection string.
    # The `credential` field is optional.
    # type: azure
    # container: <container_name>
    # account_url: <account_url>
    # credential: <credential>

  # This is the number of GPUs there are per machine. Determined uses this information when scheduling
  # multi-GPU tasks. Each multi-GPU (distributed training) task will be scheduled as a set of
  # `slotsPerTask / maxSlotsPerPod` separate pods, with each pod assigned up to `maxSlotsPerPod` GPUs.
  # Distributed tasks with sizes that are not divisible by `maxSlotsPerPod` are never scheduled. If
  # you have a cluster of different size nodes (e.g., 4 and 8 GPUs per node), set `maxSlotsPerPod` to
  # the greatest common divisor of all the sizes (4, in that case).
  # maxSlotsPerPod:

  ## For CPU-only clusters, use `slotType: cpu`, and make sure to set `slotResourceRequest` below.
  # slotType: cpu
  # slotResourceRequests:
  ## Number of cpu units requested for compute slots. Note: since kubernetes may schedule some
  ## system tasks on the nodes which take up some resources, 8-core node may not always fit
  ## a `cpu: 8` task container.
  # cpu: 7

  # Memory and CPU requirements for the master instance. Should be adjusted for scale.
  masterCpuRequest: "2"
  masterMemRequest: 8Gi
  # masterCpuLimit: "2"
  # masterMemLimit: 8Gi

  ## Configure the task container defaults. Tasks include trials, commands, TensorBoards, notebooks,
  ## and shells. For all task containers, shm_size_bytes and network_mode are configurable. For
  ## trials, the network interface used by distributed (multi-machine) training is configurable.
  taskContainerDefaults:
    # networkMode: bridge
    # dtrainNetworkInterface: "<network interface name>"
    forcePullImage: false

    # Configure a default pod spec for all GPU tasks (experiments, notebooks, commands) and CPU tasks
    # (CPU notebooks, TensorBoards, zero-slot commands). If a pod spec is defined for an individual
    # task, that pod spec will replace the default one that is defined here. See
    # https://docs:determined.ai/latest/topic-guides/custom-pod-specs.html for more details.
    # cpuPodSpec:
    # gpuPodSpec:

    # Configure default Docker images for all GPU tasks (experiments, notebooks, commands) and
    # CPU tasks (CPU notebooks, TensorBoards, zero-slot commands). If a Docker image is defined
    # for an individual task, that image will replace the default one that is defined here.
    # If specifying a default image, both GPU and CPU default images must be defined.
    # cpuImage:
    # gpuImage:

    # Configure an inline script that will be executed as part of the task setup process that will install
    # the Pachyderm notebook extension in det launched notebooks.
    startupHook: |
      if [[ -n "$PACHD_ADDRESS" && "$SKIP_PACHYDERM_INSTALL" != "true" && "$DET_TASK_TYPE" == "NOTEBOOK" ]]; then
          proxy_dns=$(echo $PACHD_ADDRESS | sed 's/grpc:\/\/pachd/pachyderm-proxy/' | sed 's/:30650//') 
          version=$(curl -skLX POST -H "Content-Type: application/json" $proxy_dns/api/versionpb_v2.API/GetVersion | sed -n 's/.*"major": *\([0-9]*\), *"minor": *\([0-9]*\), *"micro": *\([0-9]*\), *"additional": *"\([^"]*\)".*/\1.\2.\3\4/p')
          jupyterlab_version=$(echo $version | sed 's/-alpha\.\([0-9]*\)/a\1/;s/-rc\.\([0-9]*\)/rc\1/')
          pip install jupyterlab-pachyderm==$jupyterlab_version
          # Detect the architecture
          architecture=$(uname -m)
          case $architecture in
              x86_64)
                  echo "Detected AMD64 architecture"
                  curl -L "https://github.com/pachyderm/pachyderm/releases/download/v${version}/pachctl_${version}_linux_amd64.tar.gz" | tar -xzv --strip-components=1 -C ../
                  ;;
              arm64 | aarch64)
                  echo "Detected ARM64 architecture"
                  curl -L "https://github.com/pachyderm/pachyderm/releases/download/v${version}/pachctl_${version}_linux_arm64.tar.gz" | tar -xzv --strip-components=1 -C ../
                  ;;
              *)
                  echo "Unsupported architecture: $architecture, set SKIP_PACHYDERM_INSTALL=true to skip pachyderm installation"
                  exit 1
                  ;;
          esac
          export PATH=/run/determined:$PATH
      fi      

  ## Configure whether we collect anonymous information about the usage of Determined.
  telemetry:
    enabled: true

  ## Configure Prometheus endpoints for monitoring.
  # observability:
  #   enable_prometheus: true

  ## A user-friendly name to identify this cluster by.
  # clusterName: Dev

  ## Specifies the duration in seconds before idle
  ## TensorBoard instances are automatically terminated.
  ## A TensorBoard instance is considered to be idle if
  ## it does not receive any HTTP traffic. The default timeout is 300 seconds (5 minutes).
  # tensorboardTimeout: 300

  ## Specifies the duration in seconds before idle notebook instances are automatically terminated.
  ## This behavior is disabled by default.
  # notebookTimeout: 1800

  ## deprecated, use initialUserPassword instead.
  # defaultPassword sets the password for the admin and determined user accounts.
  # defaultPassword:

  ## Configure how trial logs are stored.
  # logging:
  ## The backend to use. Can be `default` to send logs to the master to store in the PostgreSQL
  ## database or `elastic` to store logs in an Elasticsearch cluster (without going through the
  ## master).
  # type: default

  ## The remaining options should be provided only for the `elastic` backend.

  ## The host and port to use to connect to the Elasticsearch cluster.
  # host: <host>
  # port: <port>

  ## Authentication and TLS options for making the connection to Elasticsearch.
  # security:
  # username: <username>
  # password: <password>
  # tls:
  # enabled: true
  # skipVerify: false

  ## The name to use when verifying the certificate, if different from the name used to connect.
  # certificateName: <name>

  ## This value must contain the contents of the certificate file, not a path. It may be set
  ## directly or using `helm install --set-file logging.security.tls.certificate=<path>`.
  # certificate: <certificate contents>

  ## Configure the default Determined scheduler
  ## Currently supports "coscheduler" for gang scheduling and "preemption" for priority based
  ## scheduling with preemption
  # defaultScheduler: preemption

  ## Configure the resource pools in the Determined cluster.
  resourcePools:
    - pool_name: default
  # defaultAuxResourcePool: default
  # defaultComputeResourcePool: default

  ## Configure the initial user password for the cluster
  # initialUserPassword

  # additional_resource_managers:
  #   - resource_manager:
  #       type: kubernetes
  #       max_slots_per_pod: 1
  #       cluster_name: additional-cluster
  #       default_namespace: default
  #       kubeconfig_secret_name: additionalrm
  #       kubeconfig_secret_value: config
  #       determined_master_ip: 10.11.12.13
  #       determined_master_port: 8080
  #     resource_pools:
  #       - pool_name: additional_pool
  resourceManager:
    # Specifies the namespace in a given Kubernetes compute cluster where all workload pods will be sent by default.
    defaultNamespace: ""
    clusterName: ""