AI Inference
Run chat completions, text generation, and embeddings through the Strongly AI gateway.
Overview
The AI Inference resource provides:
- Chat completions with multi-turn conversation support
- Text completions from a prompt
- Streaming responses for real-time token delivery
- Text embeddings for similarity search and RAG applications
- Unified access to all configured AI models
All inference methods are accessed through client.ai.inference.
Chat Completion
Generate a response from a conversation history:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is gradient descent?"},
],
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}")
Multi-Turn Conversations
Pass the full conversation history to maintain context:
from strongly import Strongly
client = Strongly()
messages = [
{"role": "system", "content": "You are a Python expert."},
{"role": "user", "content": "What are decorators?"},
]
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content}")
# Continue the conversation
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "Show me an example."})
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content}")
Generation Parameters
Control the output with temperature, token limits, and stop sequences:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "List three benefits of unit testing."}],
temperature=0.3, # Lower = more deterministic
max_tokens=256, # Cap the response length
top_p=0.9, # Nucleus sampling
stop=["\n\n", "---"], # Stop generating at these sequences
)
print(response.content)
Streaming
Stream tokens as they are generated for a responsive user experience:
from strongly import Strongly
client = Strongly()
print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Write a haiku about machine learning."}],
stream=True,
):
print(chunk.content, end="", flush=True)
print()
Collecting a Streamed Response
from strongly import Strongly
client = Strongly()
chunks = []
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Explain recursion briefly."}],
stream=True,
):
chunks.append(chunk.content)
full_response = "".join(chunks)
print(full_response)
Text Completion
Generate text from a single prompt (no conversation history):
from strongly import Strongly
client = Strongly()
response = client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Explain the difference between a list and a tuple in Python:",
max_tokens=200,
temperature=0.5,
)
print(response.content)
print(f"Tokens: {response.usage.total_tokens}")
Streaming Text Completion
from strongly import Strongly
client = Strongly()
for chunk in client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Write a short poem about data pipelines:",
stream=True,
):
print(chunk.content, end="", flush=True)
print()
Embeddings
Generate vector embeddings for similarity search, clustering, and RAG applications:
from strongly import Strongly
client = Strongly()
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input="Machine learning is a subset of artificial intelligence.",
)
print(f"Embedding dimension: {len(response.embeddings[0])}")
Batch Embeddings
Pass a list of strings to embed multiple texts in a single call:
from strongly import Strongly
client = Strongly()
texts = [
"Machine learning uses statistical methods to learn from data.",
"Deep learning is based on artificial neural networks.",
"The weather forecast calls for rain tomorrow.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")
Similarity Search Example
import math
from strongly import Strongly
client = Strongly()
def cosine_similarity(a, b):
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
return dot / (norm_a * norm_b)
texts = [
"Machine learning is a subset of artificial intelligence.",
"Deep learning uses neural networks with many layers.",
"The weather today is sunny and warm.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
print(f"ML vs Deep Learning: {sim_01:.4f}") # High similarity
print(f"ML vs Weather: {sim_02:.4f}") # Low similarity
Parameters
chat_completion
def chat_completion(
self,
*,
model: str,
messages: List[Union[Dict[str, Any], ChatMessage]],
stream: bool = False,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
top_p: float = 1.0,
stop: Optional[Union[str, List[str]]] = None,
**kwargs: Any,
) -> Union[ChatCompletion, Iterator[StreamChunk]]
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model identifier (e.g., "gpt-4o-mini") |
messages | List[Union[Dict, ChatMessage]] | required | Conversation messages with role and content |
stream | bool | False | Return an iterator of StreamChunk objects |
max_tokens | int | None | Maximum tokens to generate |
temperature | float | 0.7 | Sampling temperature (0.0 to 2.0) |
top_p | float | 1.0 | Nucleus sampling threshold |
stop | str or List[str] | None | Stop sequence(s) |
**kwargs | Any | — | Additional model-specific parameters |
completion
def completion(
self,
*,
model: str,
prompt: str,
stream: bool = False,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
**kwargs: Any,
) -> Union[Completion, Iterator[StreamChunk]]
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Model identifier |
prompt | str | required | Text prompt |
stream | bool | False | Return an iterator of StreamChunk objects |
max_tokens | int | None | Maximum tokens to generate |
temperature | float | 0.7 | Sampling temperature |
**kwargs | Any | — | Additional model-specific parameters |
embedding
def embedding(
self,
*,
model: str,
input: Union[str, List[str]],
**kwargs: Any,
) -> EmbeddingResponse
| Parameter | Type | Default | Description |
|---|---|---|---|
model | str | required | Embedding model identifier (e.g., "text-embedding-ada-002") |
input | str or List[str] | required | Text or list of texts to embed |
**kwargs | Any | — | Additional model-specific parameters |
Response Models
ChatCompletion
| Field | Type | Description |
|---|---|---|
id | str | Unique completion identifier |
model | str | Model used for generation |
choices | List[ChatCompletionChoice] | List of generated choices |
usage | TokenUsage | Token usage breakdown |
created | int | Unix timestamp of creation |
Property: content -- Returns the first choice's message content (choices[0].message.content).
Completion
| Field | Type | Description |
|---|---|---|
id | str | Unique completion identifier |
model | str | Model used for generation |
choices | List[ChatCompletionChoice] | List of generated choices |
usage | TokenUsage | Token usage breakdown |
created | int | Unix timestamp of creation |
Property: content -- Returns the first choice's message content.
ChatCompletionChoice
| Field | Type | Description |
|---|---|---|
index | int | Choice index |
message | ChatMessage | Generated message |
finish_reason | str | Reason generation stopped (stop, length, etc.) |
ChatMessage
| Field | Type | Description |
|---|---|---|
role | str | Message role (system, user, assistant, function) |
content | str | Message text content |
name | str | Optional sender name |
function_call | dict | Function call details (if applicable) |
tool_calls | list | Tool call details (if applicable) |
TokenUsage
| Field | Type | Description |
|---|---|---|
prompt_tokens | int | Tokens in the prompt |
completion_tokens | int | Tokens in the generated response |
total_tokens | int | Total tokens consumed |
StreamChunk
| Field | Type | Description |
|---|---|---|
id | str | Chunk identifier |
model | str | Model used for generation |
content | str | Generated text fragment |
finish_reason | str | Set on the final chunk |
role | str | Message role |
index | int | Choice index |
EmbeddingResponse
| Field | Type | Description |
|---|---|---|
model | str | Embedding model used |
data | List[EmbeddingData] | List of embedding objects |
usage | TokenUsage | Token usage breakdown |
Property: embeddings -- Returns a flat List[List[float]] of embedding vectors.
EmbeddingData
| Field | Type | Description |
|---|---|---|
embedding | List[float] | The embedding vector |
index | int | Position in the input list |
Complete Example
import math
from strongly import Strongly
def main():
client = Strongly()
# --- Chat Completion ---
print("--- Chat Completion ---")
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a data science tutor."},
{"role": "user", "content": "Explain overfitting in one paragraph."},
],
temperature=0.5,
max_tokens=200,
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}\n")
# --- Streaming Chat ---
print("--- Streaming Chat ---")
print("Response: ", end="", flush=True)
for chunk in client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Write a haiku about neural networks."}],
stream=True,
):
print(chunk.content, end="", flush=True)
print("\n")
# --- Text Completion ---
print("--- Text Completion ---")
response = client.ai.inference.completion(
model="gpt-4o-mini",
prompt="Three benefits of automated testing:",
max_tokens=150,
temperature=0.3,
)
print(response.content)
print(f"Tokens used: {response.usage.total_tokens}\n")
# --- Embeddings ---
print("--- Embeddings ---")
texts = [
"Machine learning is a subset of artificial intelligence.",
"Deep learning uses neural networks with many layers.",
"The weather today is sunny and warm.",
]
response = client.ai.inference.embedding(
model="text-embedding-ada-002",
input=texts,
)
print(f"Generated {len(response.embeddings)} embeddings")
print(f"Dimension: {len(response.embeddings[0])}")
def cosine_similarity(a, b):
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
return dot / (norm_a * norm_b)
sim_01 = cosine_similarity(response.embeddings[0], response.embeddings[1])
sim_02 = cosine_similarity(response.embeddings[0], response.embeddings[2])
print(f"ML vs Deep Learning: {sim_01:.4f}")
print(f"ML vs Weather: {sim_02:.4f}")
# --- Multi-Turn Conversation ---
print("\n--- Multi-Turn Conversation ---")
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "What is the difference between SQL and NoSQL?"},
]
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content[:200]}...")
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": "When should I choose NoSQL?"})
response = client.ai.inference.chat_completion(
model="gpt-4o-mini",
messages=messages,
)
print(f"Assistant: {response.content[:200]}...")
if __name__ == "__main__":
main()