Endpoints¶

Top-level package for fastmlx.

`add_model(model_name)` `async` ¶

Add a new model to the API.

Parameters:

Name	Type	Description	Default
`model_name`	`str`	The name of the model to add.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary containing the status of the operation.

Source code in fastmlx/fastmlx.py

@app.post("/v1/models")
async def add_model(model_name: str):
    """
    Register a model with the API by loading it through the model provider.

    Args:
        model_name (str): The name of the model to add.

    Returns:
        dict (dict): A payload whose ``status`` is ``"success"`` and whose
            ``message`` names the model that was added.
    """
    # Only the call matters here; whatever load_model returns is discarded.
    model_provider.load_model(model_name)

    confirmation = f"Model {model_name} added successfully"
    return dict(status="success", message=confirmation)

`chat_completion(request)` `async` ¶

Handle chat completion requests for both VLM and LM models.

Parameters:

Name	Type	Description	Default
`request`	`ChatCompletionRequest`	The chat completion request.	required

Returns:

Name	Type	Description
`ChatCompletionResponse`	`ChatCompletionResponse`	The generated chat completion response.

Raises:

Type	Description
`HTTPException(str)`	If the MLX library is not available (status 500), or if a VLM model is requested without an image URL (status 400).

Source code in fastmlx/fastmlx.py

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
    """
    Handle chat completion requests for both VLM and LM models.

    The requested model is loaded through ``model_provider`` and its (remapped)
    ``model_type`` decides which path is taken:

    * VLM path: message contents are flattened to text, the last ``image_url``
      part seen is used as the image, and generation goes through
      ``vlm_generate`` / ``vlm_stream_generator``.
    * LM path: when tools are supplied (and the model name does not contain
      ``"firefunction-v2"``) and the conversation does not already start with
      a system message, a tool prompt is injected; generation then goes
      through ``lm_generate`` / ``lm_stream_generator``.

    Args:
        request (ChatCompletionRequest): The chat completion request.

    Returns:
        ChatCompletionResponse (ChatCompletionResponse): The generated chat
            completion response, or a ``StreamingResponse`` with media type
            ``text/event-stream`` when ``request.stream`` is truthy.

    Raises:
        HTTPException (str): With status 500 if the MLX library is not
            available, or with status 400 if a VLM model is requested but no
            image URL is present in the messages.
    """
    if not MLX_AVAILABLE:
        raise HTTPException(status_code=500, detail="MLX library not available")

    stream = request.stream
    model_data = model_provider.load_model(request.model)
    model = model_data["model"]
    config = model_data["config"]
    model_type = MODEL_REMAPPING.get(config["model_type"], config["model_type"])
    stop_words = get_eom_token(request.model)

    # Only the LM path produces token usage. Without this default the
    # non-streaming VLM path reached the final call with the name unbound.
    # NOTE(review): assumes handle_function_calls tolerates a None usage
    # value - TODO confirm against its definition.
    token_length_info = None

    if model_type in MODELS["vlm"]:
        processor = model_data["processor"]
        image_processor = model_data["image_processor"]

        image_url = None
        chat_messages = []

        for msg in request.messages:
            if isinstance(msg.content, str):
                chat_messages.append({"role": msg.role, "content": msg.content})
            elif isinstance(msg.content, list):
                text_parts = []
                for content_part in msg.content:
                    if content_part.type == "text":
                        text_parts.append(content_part.text)
                    elif content_part.type == "image_url":
                        # A later image part overrides an earlier one.
                        image_url = content_part.image_url["url"]
                chat_messages.append(
                    {"role": msg.role, "content": " ".join(text_parts).strip()}
                )

        # We are already inside the VLM branch, so only the URL needs checking.
        if not image_url:
            raise HTTPException(
                status_code=400, detail="Image URL not provided for VLM model"
            )

        if model.config.model_type != "paligemma":
            prompt = apply_vlm_chat_template(processor, config, chat_messages)
        else:
            # paligemma bypasses the chat template and uses the last message text.
            prompt = chat_messages[-1]["content"]

        if stream:
            return StreamingResponse(
                vlm_stream_generator(
                    model,
                    request.model,
                    processor,
                    image_url,
                    prompt,
                    image_processor,
                    request.max_tokens,
                    request.temperature,
                    stream_options=request.stream_options,
                ),
                media_type="text/event-stream",
            )

        # Generate the response
        output = vlm_generate(
            model,
            processor,
            image_url,
            prompt,
            image_processor,
            max_tokens=request.max_tokens,
            temp=request.temperature,
            verbose=False,
        )

    else:
        # Add function calling information to the prompt
        if request.tools and "firefunction-v2" not in request.model:
            # A caller-supplied system prompt is left untouched.
            has_system_prompt = (
                request.messages and request.messages[0].role == "system"
            )
            if not has_system_prompt:
                # Generate system prompt based on model and tools
                prompt, user_role = get_tool_prompt(
                    request.model,
                    [tool.model_dump() for tool in request.tools],
                    request.messages[-1].content,
                )

                if user_role:
                    # The tool prompt replaces the content of the last message.
                    request.messages[-1].content = prompt
                else:
                    # Insert the system prompt at the beginning of the messages
                    request.messages.insert(
                        0, ChatMessage(role="system", content=prompt)
                    )

        tokenizer = model_data["tokenizer"]

        chat_messages = [
            {"role": msg.role, "content": msg.content} for msg in request.messages
        ]
        prompt = apply_lm_chat_template(tokenizer, chat_messages, request)

        if stream:
            return StreamingResponse(
                lm_stream_generator(
                    model,
                    request.model,
                    tokenizer,
                    prompt,
                    request.max_tokens,
                    request.temperature,
                    stop_words=stop_words,
                    stream_options=request.stream_options,
                ),
                media_type="text/event-stream",
            )

        output, token_length_info = lm_generate(
            model,
            tokenizer,
            prompt,
            request.max_tokens,
            temp=request.temperature,
            stop_words=stop_words,
        )

    # Parse the output to check for function calls
    return handle_function_calls(output, request, token_length_info)

`get_supported_models()` `async` ¶

Get a list of supported model types for VLM and LM.

Returns:

Name	Type	Description
`JSONResponse`	`json`	A JSON response containing the supported models.

Source code in fastmlx/fastmlx.py

@app.get("/v1/supported_models", response_model=SupportedModels)
async def get_supported_models():
    """
    Report the model types this server supports for VLM and LM.

    Returns:
        JSONResponse (json): The ``MODELS`` mapping wrapped in a JSON response.
    """
    supported = MODELS
    return JSONResponse(content=supported)

`list_models()` `async` ¶

Get list of models - provided in OpenAI API compliant format.

Source code in fastmlx/fastmlx.py

@app.get("/v1/models")
async def list_models():
    """
    Return the available models as an OpenAI API compliant list object.

    Each entry carries the model id, the object kind ``"model"``, a creation
    timestamp taken at request time, and the owner ``"system"``.
    """
    available = await model_provider.get_available_models()
    entries = [
        {
            "id": name,
            "object": "model",
            # Stamped per entry at response time, not at model load time.
            "created": int(time.time()),
            "owned_by": "system",
        }
        for name in available
    ]
    return {"object": "list", "data": entries}

`lm_generate(model, tokenizer, prompt, max_tokens=100, **kwargs)` ¶

Generate a complete response from the model.

Parameters:

Name	Type	Description	Default
`model`	`Module`	The language model.	required
`tokenizer`	`PreTrainedTokenizer`	The tokenizer.	required
`prompt`	`str`	The string prompt.	required
`max_tokens`	`int`	The maximum number of tokens. Default: `100`.	`100`
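`verbose`	`bool`	If `True`, print tokens and timing information. Default: `False`. (Listed in the docstring, but not a parameter of the signature shown below.)	`False`
`formatter`	`Optional[Callable]`	A function which takes a token and a probability and displays it. (Listed in the docstring, but not a parameter of the signature shown below.)	—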
`kwargs`		The remaining options get passed to :func:`generate_step`. See :func:`generate_step` for more details.	`{}`

Source code in fastmlx/utils.py

def lm_generate(
    model,
    tokenizer,
    prompt: str,
    max_tokens: int = 100,
    **kwargs,
) -> "Tuple[str, Usage]":
    """
    Generate a complete response from the model.

    Args:
       model (nn.Module): The language model.
       tokenizer (PreTrainedTokenizer): The tokenizer. It is wrapped in a
           ``TokenizerWrapper`` if it is not one already.
       prompt (str): The string prompt.
       max_tokens (int): The maximum number of tokens. Default: ``100``.
       kwargs: ``stop_words`` (optional) is removed from ``kwargs`` and used
          to end generation early; the remaining options get passed to
          :func:`generate_step`. See :func:`generate_step` for more details.

    Returns:
       tuple: ``(text, usage)`` where ``text`` is the detokenized completion
          and ``usage`` is a ``Usage`` holding the prompt, completion and
          total token counts.
    """
    if not isinstance(tokenizer, TokenizerWrapper):
        tokenizer = TokenizerWrapper(tokenizer)

    stop_words = kwargs.pop("stop_words", [])

    # NOTE(review): only element [0] of the tokenized stop words is kept, so
    # presumably just the first stop word contributes ids - confirm intent.
    stop_words_id = (
        tokenizer._tokenizer(stop_words)["input_ids"][0] if stop_words else None
    )

    prompt_tokens = mx.array(tokenizer.encode(prompt))
    prompt_token_len = len(prompt_tokens)
    detokenizer = tokenizer.detokenizer

    detokenizer.reset()

    # range() comes first so that zip stops before pulling another token
    # from generate_step once max_tokens is reached; with the generator first,
    # one extra token was generated and then thrown away.
    for _, (token, _logprobs) in zip(
        range(max_tokens),
        generate_step(prompt_tokens, model, **kwargs),
    ):
        if token == tokenizer.eos_token_id or (
            stop_words_id and token in stop_words_id
        ):
            break

        detokenizer.add_token(token)

    detokenizer.finalize()

    _completion_tokens = len(detokenizer.tokens)
    token_length_info: Usage = Usage(
        prompt_tokens=prompt_token_len,
        completion_tokens=_completion_tokens,
        total_tokens=prompt_token_len + _completion_tokens,
    )
    return detokenizer.text, token_length_info

`remove_model(model_name)` `async` ¶

Remove a model from the API.

Parameters:

Name	Type	Description	Default
`model_name`	`str`	The name of the model to remove.	required

Returns:

Name	Type	Description
`Response`	`str`	A 204 No Content response if successful.

Raises:

Type	Description
`HTTPException(str)`	If the model is not found.

Source code in fastmlx/fastmlx.py

@app.delete("/v1/models")
async def remove_model(model_name: str):
    """
    Remove a model from the API.

    The incoming name is URL-decoded and stripped of surrounding double
    quotes before it is handed to the model provider.

    Args:
        model_name (str): The name of the model to remove.

    Returns:
        Response (str): A 204 No Content response if successful.

    Raises:
        HTTPException (str): If the model is not found.
    """
    decoded_name = unquote(model_name).strip('"')
    was_removed = await model_provider.remove_model(decoded_name)

    if not was_removed:
        raise HTTPException(status_code=404, detail=f"Model '{decoded_name}' not found")

    # 204 No Content - successful deletion
    return Response(status_code=204)

Endpoints¶

add_model(model_name) async ¶

chat_completion(request) async ¶

get_supported_models() async ¶

list_models() async ¶

lm_generate(model, tokenizer, prompt, max_tokens=100, **kwargs) ¶

remove_model(model_name) async ¶

`add_model(model_name)` `async` ¶

`chat_completion(request)` `async` ¶

`get_supported_models()` `async` ¶

`list_models()` `async` ¶

`lm_generate(model, tokenizer, prompt, max_tokens=100, **kwargs)` ¶

`remove_model(model_name)` `async` ¶