"""CLI commands for Hugging Face Inference Endpoints.""" import json from typing import Annotated, Optional import typer from huggingface_hub._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric from huggingface_hub.errors import HfHubHTTPError from ._cli_utils import TokenOpt, get_hf_api, typer_factory ie_cli = typer_factory(help="Manage Hugging Face Inference Endpoints.") catalog_app = typer_factory(help="Interact with the Inference Endpoints catalog.") NameArg = Annotated[ str, typer.Argument(help="Endpoint name."), ] NameOpt = Annotated[ Optional[str], typer.Option(help="Endpoint name."), ] NamespaceOpt = Annotated[ Optional[str], typer.Option( help="The namespace associated with the Inference Endpoint. Defaults to the current user's namespace.", ), ] def _print_endpoint(endpoint: InferenceEndpoint) -> None: typer.echo(json.dumps(endpoint.raw, indent=2, sort_keys=True)) @ie_cli.command() def ls( namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: """Lists all Inference Endpoints for the given namespace.""" api = get_hf_api(token=token) try: endpoints = api.list_inference_endpoints(namespace=namespace, token=token) except HfHubHTTPError as error: typer.echo(f"Listing failed: {error}") raise typer.Exit(code=error.response.status_code) from error typer.echo( json.dumps( {"items": [endpoint.raw for endpoint in endpoints]}, indent=2, sort_keys=True, ) ) @ie_cli.command(name="deploy") def deploy( name: NameArg, repo: Annotated[ str, typer.Option( help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').", ), ], framework: Annotated[ str, typer.Option( help="The machine learning framework used for the model (e.g. 'vllm').", ), ], accelerator: Annotated[ str, typer.Option( help="The hardware accelerator to be used for inference (e.g. 'cpu').", ), ], instance_size: Annotated[ str, typer.Option( help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", ), ], instance_type: Annotated[ str, typer.Option( help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", ), ], region: Annotated[ str, typer.Option( help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').", ), ], vendor: Annotated[ str, typer.Option( help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').", ), ], *, namespace: NamespaceOpt = None, task: Annotated[ Optional[str], typer.Option( help="The task on which to deploy the model (e.g. 'text-classification').", ), ] = None, token: TokenOpt = None, min_replica: Annotated[ int, typer.Option( help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.", ), ] = 1, max_replica: Annotated[ int, typer.Option( help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.", ), ] = 1, scale_to_zero_timeout: Annotated[ Optional[int], typer.Option( help="The duration in minutes before an inactive endpoint is scaled to zero.", ), ] = None, scaling_metric: Annotated[ Optional[InferenceEndpointScalingMetric], typer.Option( help="The metric reference for scaling.", ), ] = None, scaling_threshold: Annotated[ Optional[float], typer.Option( help="The scaling metric threshold used to trigger a scale up. 
Ignored when scaling metric is not provided.", ), ] = None, ) -> None: """Deploy an Inference Endpoint from a Hub repository.""" api = get_hf_api(token=token) endpoint = api.create_inference_endpoint( name=name, repository=repo, framework=framework, accelerator=accelerator, instance_size=instance_size, instance_type=instance_type, region=region, vendor=vendor, namespace=namespace, task=task, token=token, min_replica=min_replica, max_replica=max_replica, scaling_metric=scaling_metric, scaling_threshold=scaling_threshold, scale_to_zero_timeout=scale_to_zero_timeout, ) _print_endpoint(endpoint) @catalog_app.command(name="deploy") def deploy_from_catalog( repo: Annotated[ str, typer.Option( help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').", ), ], name: NameOpt = None, namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: """Deploy an Inference Endpoint from the Model Catalog.""" api = get_hf_api(token=token) try: endpoint = api.create_inference_endpoint_from_catalog( repo_id=repo, name=name, namespace=namespace, token=token, ) except HfHubHTTPError as error: typer.echo(f"Deployment failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint) def list_catalog( token: TokenOpt = None, ) -> None: """List available Catalog models.""" api = get_hf_api(token=token) try: models = api.list_inference_catalog(token=token) except HfHubHTTPError as error: typer.echo(f"Catalog fetch failed: {error}") raise typer.Exit(code=error.response.status_code) from error typer.echo(json.dumps({"models": models}, indent=2, sort_keys=True)) catalog_app.command(name="ls")(list_catalog) ie_cli.command(name="list-catalog", help="List available Catalog models.", hidden=True)(list_catalog) ie_cli.add_typer(catalog_app, name="catalog") @ie_cli.command() def describe( name: NameArg, namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: """Get information about an existing endpoint.""" api = get_hf_api(token=token) try: endpoint = api.get_inference_endpoint(name=name, namespace=namespace, token=token) except HfHubHTTPError as error: typer.echo(f"Fetch failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint) @ie_cli.command() def update( name: NameArg, namespace: NamespaceOpt = None, repo: Annotated[ Optional[str], typer.Option( help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').", ), ] = None, accelerator: Annotated[ Optional[str], typer.Option( help="The hardware accelerator to be used for inference (e.g. 'cpu').", ), ] = None, instance_size: Annotated[ Optional[str], typer.Option( help="The size or type of the instance to be used for hosting the model (e.g. 'x4').", ), ] = None, instance_type: Annotated[ Optional[str], typer.Option( help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').", ), ] = None, framework: Annotated[ Optional[str], typer.Option( help="The machine learning framework used for the model (e.g. 'custom').", ), ] = None, revision: Annotated[ Optional[str], typer.Option( help="The specific model revision to deploy on the Inference Endpoint (e.g. '6c0e6080953db56375760c0471a8c5f2929baf11').", ), ] = None, task: Annotated[ Optional[str], typer.Option( help="The task on which to deploy the model (e.g. 
'text-classification').", ), ] = None, min_replica: Annotated[ Optional[int], typer.Option( help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.", ), ] = None, max_replica: Annotated[ Optional[int], typer.Option( help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.", ), ] = None, scale_to_zero_timeout: Annotated[ Optional[int], typer.Option( help="The duration in minutes before an inactive endpoint is scaled to zero.", ), ] = None, scaling_metric: Annotated[ Optional[InferenceEndpointScalingMetric], typer.Option( help="The metric reference for scaling.", ), ] = None, scaling_threshold: Annotated[ Optional[float], typer.Option( help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.", ), ] = None, token: TokenOpt = None, ) -> None: """Update an existing endpoint.""" api = get_hf_api(token=token) try: endpoint = api.update_inference_endpoint( name=name, namespace=namespace, repository=repo, framework=framework, revision=revision, task=task, accelerator=accelerator, instance_size=instance_size, instance_type=instance_type, min_replica=min_replica, max_replica=max_replica, scale_to_zero_timeout=scale_to_zero_timeout, scaling_metric=scaling_metric, scaling_threshold=scaling_threshold, token=token, ) except HfHubHTTPError as error: typer.echo(f"Update failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint) @ie_cli.command() def delete( name: NameArg, namespace: NamespaceOpt = None, yes: Annotated[ bool, typer.Option("--yes", help="Skip confirmation prompts."), ] = False, token: TokenOpt = None, ) -> None: """Delete an Inference Endpoint permanently.""" if not yes: confirmation = typer.prompt(f"Delete endpoint '{name}'? 
Type the name to confirm.") if confirmation != name: typer.echo("Aborted.") raise typer.Exit(code=2) api = get_hf_api(token=token) try: api.delete_inference_endpoint(name=name, namespace=namespace, token=token) except HfHubHTTPError as error: typer.echo(f"Delete failed: {error}") raise typer.Exit(code=error.response.status_code) from error typer.echo(f"Deleted '{name}'.") @ie_cli.command() def pause( name: NameArg, namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: """Pause an Inference Endpoint.""" api = get_hf_api(token=token) try: endpoint = api.pause_inference_endpoint(name=name, namespace=namespace, token=token) except HfHubHTTPError as error: typer.echo(f"Pause failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint) @ie_cli.command() def resume( name: NameArg, namespace: NamespaceOpt = None, fail_if_already_running: Annotated[ bool, typer.Option( "--fail-if-already-running", help="If `True`, the method will raise an error if the Inference Endpoint is already running.", ), ] = False, token: TokenOpt = None, ) -> None: """Resume an Inference Endpoint.""" api = get_hf_api(token=token) try: endpoint = api.resume_inference_endpoint( name=name, namespace=namespace, token=token, running_ok=not fail_if_already_running, ) except HfHubHTTPError as error: typer.echo(f"Resume failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint) @ie_cli.command() def scale_to_zero( name: NameArg, namespace: NamespaceOpt = None, token: TokenOpt = None, ) -> None: """Scale an Inference Endpoint to zero.""" api = get_hf_api(token=token) try: endpoint = api.scale_to_zero_inference_endpoint(name=name, namespace=namespace, token=token) except HfHubHTTPError as error: typer.echo(f"Scale To Zero failed: {error}") raise typer.Exit(code=error.response.status_code) from error _print_endpoint(endpoint)
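

# A minimal usage sketch for the commands defined above. The `hf endpoints` entry point is an
# assumption about how this Typer app is mounted in the CLI; option names follow Typer's default
# underscore-to-dash conversion of the parameters declared in this module, and the example values
# are taken from the help strings above.
#
#   hf endpoints deploy my-endpoint \
#       --repo openai/gpt-oss-120b --framework vllm --accelerator cpu \
#       --instance-size x4 --instance-type intel-icl --region us-east-1 --vendor aws
#   hf endpoints catalog deploy --repo openai/gpt-oss-120b --name my-endpoint
#   hf endpoints describe my-endpoint
#   hf endpoints pause my-endpoint
#   hf endpoints delete my-endpoint --yes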