Python API
Note: this API is in preview and is subject to change.
- Install and import
- Model class
- Config class
- GeneratorParams class
- Generator class
- Tokenizer class
- TokenizerStream class
- NamedTensors class
- Tensor class
- Adapters class
- MultiModalProcessor class
- Images class
- Audios class
- Utility functions
Install and import
The Python API is delivered by the onnxruntime-genai Python package.
```bash
pip install onnxruntime-genai
```

```python
import onnxruntime_genai
```
Model class
Load a model
```python
onnxruntime_genai.Model(config_path: str) -> Model
onnxruntime_genai.Model(config: onnxruntime_genai.Config) -> Model
```
Properties
- `type`: Returns the model type as a string.

  ```python
  model = onnxruntime_genai.Model("config.json")
  print(model.type)
  ```

- `device_type`: Returns the device type as a string.

  ```python
  print(model.device_type)
  ```
Methods
- `create_multimodal_processor() -> MultiModalProcessor`

  ```python
  processor = model.create_multimodal_processor()
  ```
Config class
```python
onnxruntime_genai.Config(config_path: str) -> Config
```
Methods
- `append_provider(provider: str)`

  ```python
  config = onnxruntime_genai.Config("config.json")
  config.append_provider("CUDAExecutionProvider")
  ```

- `set_provider_option(option: str, value: str)`

  ```python
  config.set_provider_option("device_id", "0")
  ```

- `clear_providers()`

  ```python
  config.clear_providers()
  ```
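A Config can be built, adjusted, and then passed to the Model constructor. The sketch below reuses the path and provider name from the examples above; the CUDA availability check is an assumption about the build you have installed.

```python
import onnxruntime_genai

config = onnxruntime_genai.Config("config.json")

# Only request CUDA when this build of onnxruntime-genai supports it.
if onnxruntime_genai.is_cuda_available():
    config.append_provider("CUDAExecutionProvider")

model = onnxruntime_genai.Model(config)
print(model.type, model.device_type)
```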
GeneratorParams class
```python
onnxruntime_genai.GeneratorParams(model: Model) -> GeneratorParams
```
Methods
- `set_inputs(named_tensors: NamedTensors)`

  ```python
  params = onnxruntime_genai.GeneratorParams(model)
  named_tensors = onnxruntime_genai.NamedTensors()
  params.set_inputs(named_tensors)
  ```

- `set_model_input(name: str, value: numpy.ndarray)`

  ```python
  import numpy as np
  params.set_model_input("input_ids", np.array([1, 2, 3], dtype=np.int32))
  ```

- `try_graph_capture_with_max_batch_size(max_batch_size: int)`

  ```python
  params.try_graph_capture_with_max_batch_size(8)
  ```

- `set_search_options(**options)`

  ```python
  params.set_search_options(temperature=0.7, top_p=0.9)
  ```

- `set_guidance(type: str, data: str)`

  ```python
  params.set_guidance("prefix", "Once upon a time")
  ```
Generator class
```python
onnxruntime_genai.Generator(model: Model, params: GeneratorParams) -> Generator
```
Methods
- `is_done() -> bool`

  ```python
  generator = onnxruntime_genai.Generator(model, params)
  done = generator.is_done()
  ```

- `get_output(name: str) -> numpy.ndarray`

  ```python
  output = generator.get_output("output_ids")
  ```

- `append_tokens(tokens: numpy.ndarray[int32])`

  ```python
  generator.append_tokens(np.array([4, 5], dtype=np.int32))
  ```

- `append_tokens(tokens: onnxruntime_genai.Tensor)`

  ```python
  tensor = onnxruntime_genai.Tensor(np.array([4, 5], dtype=np.int32))
  generator.append_tokens(tensor)
  ```

- `get_logits() -> numpy.ndarray[float32]`

  ```python
  logits = generator.get_logits()
  ```

- `set_logits(new_logits: numpy.ndarray[float32])`

  ```python
  generator.set_logits(np.zeros_like(logits))
  ```

- `generate_next_token()`

  ```python
  generator.generate_next_token()
  ```

- `rewind_to(new_length: int)`

  ```python
  generator.rewind_to(2)
  ```

- `get_next_tokens() -> numpy.ndarray[int32]`

  ```python
  next_tokens = generator.get_next_tokens()
  ```

- `get_sequence(index: int) -> numpy.ndarray[int32]`

  ```python
  sequence = generator.get_sequence(0)
  ```

- `set_active_adapter(adapters: onnxruntime_genai.Adapters, adapter_name: str)`

  ```python
  adapters = onnxruntime_genai.Adapters(model)
  generator.set_active_adapter(adapters, "adapter_name")
  ```
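Taken together, these methods form a token-by-token decoding loop. The sketch below is a minimal example rather than a full recipe: it reuses the `config.json` path from the examples above and assumes a text generation model.

```python
import onnxruntime_genai

model = onnxruntime_genai.Model("config.json")
tokenizer = onnxruntime_genai.Tokenizer(model)
stream = tokenizer.create_stream()

params = onnxruntime_genai.GeneratorParams(model)
params.set_search_options(max_length=128, temperature=0.7)

generator = onnxruntime_genai.Generator(model, params)
generator.append_tokens(tokenizer.encode("Tell me a short story."))

# Generate until the search finishes, streaming each new token as text.
while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
```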
Tokenizer class
```python
onnxruntime_genai.Tokenizer(model: Model) -> Tokenizer
```
Methods
- `encode(text: str) -> numpy.ndarray[int32]`

  ```python
  tokenizer = onnxruntime_genai.Tokenizer(model)
  tokens = tokenizer.encode("Hello world")
  ```

- `to_token_id(text: str) -> int`

  ```python
  token_id = tokenizer.to_token_id("Hello")
  ```

- `decode(tokens: numpy.ndarray[int32]) -> str`

  ```python
  text = tokenizer.decode(tokens)
  ```

- `apply_chat_template(template_str: str, messages: str, tools: str = None, add_generation_prompt: bool = False) -> str`

  ```python
  chat = tokenizer.apply_chat_template("{user}: {message}", messages="Hi!", add_generation_prompt=True)
  ```

- `encode_batch(texts: list[str]) -> onnxruntime_genai.Tensor`

  ```python
  batch_tensor = tokenizer.encode_batch(["Hello", "World"])
  ```

- `decode_batch(tokens: onnxruntime_genai.Tensor) -> list[str]`

  ```python
  texts = tokenizer.decode_batch(batch_tensor)
  ```

- `create_stream() -> TokenizerStream`

  ```python
  stream = tokenizer.create_stream()
  ```
TokenizerStream class
```python
onnxruntime_genai.TokenizerStream(tokenizer: Tokenizer) -> TokenizerStream
```
Methods
- `decode(token: int32) -> str`

  ```python
  token_str = stream.decode(123)
  ```
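The stream exists because a single token does not always map to printable text; `decode` returns an empty string until a complete piece of text is available. A minimal sketch, reusing the tokenizer created above:

```python
tokens = tokenizer.encode("Hello world")
stream = tokenizer.create_stream()

# Feed tokens one at a time; only print once a displayable chunk is ready.
for token in tokens:
    piece = stream.decode(int(token))
    if piece:
        print(piece, end="")
print()
```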
NamedTensors class
```python
onnxruntime_genai.NamedTensors() -> NamedTensors
```
Methods
- `__getitem__(name: str) -> onnxruntime_genai.Tensor`

  ```python
  tensor = named_tensors["input_ids"]
  ```

- `__setitem__(name: str, value: numpy.ndarray or onnxruntime_genai.Tensor)`

  ```python
  named_tensors["input_ids"] = np.array([1, 2, 3], dtype=np.int32)
  ```

- `__contains__(name: str) -> bool`

  ```python
  exists = "input_ids" in named_tensors
  ```

- `__delitem__(name: str)`

  ```python
  del named_tensors["input_ids"]
  ```

- `__len__() -> int`

  ```python
  length = len(named_tensors)
  ```

- `keys() -> list[str]`

  ```python
  keys = named_tensors.keys()
  ```
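NamedTensors acts as a dictionary of named model inputs and is typically passed to `GeneratorParams.set_inputs` (see above). A minimal sketch, with an illustrative tensor name and values:

```python
import numpy as np
import onnxruntime_genai

named_tensors = onnxruntime_genai.NamedTensors()
named_tensors["input_ids"] = np.array([[1, 2, 3]], dtype=np.int32)  # illustrative values

print(len(named_tensors), named_tensors.keys())

params = onnxruntime_genai.GeneratorParams(model)
params.set_inputs(named_tensors)
```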
Tensor class
```python
onnxruntime_genai.Tensor(array: numpy.ndarray) -> Tensor
```
Methods
- `shape() -> list[int]`

  ```python
  tensor = onnxruntime_genai.Tensor(np.array([1, 2, 3]))
  print(tensor.shape())
  ```

- `type() -> int`

  ```python
  print(tensor.type())
  ```

- `data() -> memoryview`

  ```python
  data = tensor.data()
  ```

- `as_numpy() -> numpy.ndarray`

  ```python
  arr = tensor.as_numpy()
  ```
Adapters class
```python
onnxruntime_genai.Adapters(model: Model) -> Adapters
```
Methods
- `unload(adapter_name: str)`

  ```python
  adapters.unload("adapter_name")
  ```

- `load(file: str, name: str)`

  ```python
  adapters.load("adapter_file.bin", "adapter_name")
  ```
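Combining these calls with `Generator.set_active_adapter` above (the file name and adapter name are placeholders):

```python
import onnxruntime_genai

adapters = onnxruntime_genai.Adapters(model)
adapters.load("adapter_file.bin", "adapter_name")   # placeholder file and name

generator = onnxruntime_genai.Generator(model, params)
generator.set_active_adapter(adapters, "adapter_name")

# ... run generation with the adapter active ...

adapters.unload("adapter_name")                      # release it when no longer needed
```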
MultiModalProcessor class
```python
onnxruntime_genai.MultiModalProcessor(model: Model) -> MultiModalProcessor
```
Methods
- `__call__(prompt: str = None, images: Images = None, audios: Audios = None) -> onnxruntime_genai.Tensor`

  ```python
  result = processor(prompt="Describe this image", images=onnxruntime_genai.Images.open("image.png"))
  ```

- `create_stream() -> TokenizerStream`

  ```python
  stream = processor.create_stream()
  ```

- `decode(tokens: numpy.ndarray[int32]) -> str`

  ```python
  text = processor.decode(tokens)
  ```
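A sketch of how the processor feeds a generation loop for a vision model. It assumes a vision-capable model and an `image.png` on disk; the prompt format with the `<|image_1|>` tag follows the Phi-3 vision convention and will differ for other models, and the processor output is handed to `set_inputs` as in the library's multimodal examples.

```python
import onnxruntime_genai

model = onnxruntime_genai.Model("config.json")          # a vision-capable model
processor = model.create_multimodal_processor()
stream = processor.create_stream()

images = onnxruntime_genai.Images.open("image.png")
prompt = "<|user|>\n<|image_1|>\nDescribe this image.<|end|>\n<|assistant|>\n"
inputs = processor(prompt, images=images)

params = onnxruntime_genai.GeneratorParams(model)
params.set_inputs(inputs)

generator = onnxruntime_genai.Generator(model, params)
while not generator.is_done():
    generator.generate_next_token()
    print(stream.decode(generator.get_next_tokens()[0]), end="", flush=True)
print()
```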
Images class
```python
onnxruntime_genai.Images.open(*image_paths: str) -> Images
onnxruntime_genai.Images.open_bytes(*image_datas: bytes) -> Images
```

```python
images = onnxruntime_genai.Images.open("image1.png", "image2.jpg")

with open("image1.png", "rb") as f:
    images_bytes = onnxruntime_genai.Images.open_bytes(f.read())
```
Audios class
```python
onnxruntime_genai.Audios.open(*audio_paths: str) -> Audios
onnxruntime_genai.Audios.open_bytes(*audio_datas: bytes) -> Audios
```

```python
audios = onnxruntime_genai.Audios.open("audio1.wav")

with open("audio1.wav", "rb") as f:
    audios_bytes = onnxruntime_genai.Audios.open_bytes(f.read())
```
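Audio inputs follow the same pattern as images: open the files, then pass them to a multimodal processor created from an audio-capable model. This is a sketch only; the prompt handling depends on the specific model.

```python
import onnxruntime_genai

audios = onnxruntime_genai.Audios.open("audio1.wav")

# `processor` here comes from an audio-capable model's create_multimodal_processor().
inputs = processor(prompt="Transcribe this audio.", audios=audios)

params = onnxruntime_genai.GeneratorParams(model)
params.set_inputs(inputs)
```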
Utility functions
- `onnxruntime_genai.set_log_options(**options)`

  ```python
  onnxruntime_genai.set_log_options(verbose=True)
  ```

- `onnxruntime_genai.is_cuda_available() -> bool`

  ```python
  print(onnxruntime_genai.is_cuda_available())
  ```

- `onnxruntime_genai.is_dml_available() -> bool`

  ```python
  print(onnxruntime_genai.is_dml_available())
  ```

- `onnxruntime_genai.is_rocm_available() -> bool`

  ```python
  print(onnxruntime_genai.is_rocm_available())
  ```

- `onnxruntime_genai.is_webgpu_available() -> bool`

  ```python
  print(onnxruntime_genai.is_webgpu_available())
  ```

- `onnxruntime_genai.is_qnn_available() -> bool`

  ```python
  print(onnxruntime_genai.is_qnn_available())
  ```

- `onnxruntime_genai.is_openvino_available() -> bool`

  ```python
  print(onnxruntime_genai.is_openvino_available())
  ```

- `onnxruntime_genai.set_current_gpu_device_id(device_id: int)`

  ```python
  onnxruntime_genai.set_current_gpu_device_id(0)
  ```

- `onnxruntime_genai.get_current_gpu_device_id() -> int`

  ```python
  print(onnxruntime_genai.get_current_gpu_device_id())
  ```
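A small sketch that uses these helpers to choose a device before loading a model (the device id is illustrative):

```python
import onnxruntime_genai

# Select the first GPU only when a CUDA-enabled build is installed.
if onnxruntime_genai.is_cuda_available():
    onnxruntime_genai.set_current_gpu_device_id(0)
    print("Using GPU", onnxruntime_genai.get_current_gpu_device_id())
else:
    print("Running on the default device")

model = onnxruntime_genai.Model("config.json")
```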