Streaming guide
Streaming lets you receive tokens as they are generated instead of waiting for the full response. This guide covers how streaming works, how to handle it in Python and Node.js, and patterns for production use.
How it works
When you set stream: true (or stream=True), the API switches from a single JSON response to a stream of Server-Sent Events (SSE). Each event contains one or more tokens as they come off the GPU.
The wire format looks like this:
HTTP/1.1 200 OK
Content-Type: text/event-stream
data: {"id":"chatcmpl-abc","choices":[{"delta":{"role":"assistant","content":""},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":"Hello"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":"!"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":" How"},"index":0}],"usage":{"prompt_tokens":10,"completion_tokens":3,"total_tokens":13}}
data: [DONE]
The OpenAI SDK abstracts SSE parsing for you — you iterate over chunks and extract choices[0].delta.content from each one. The final data: [DONE] event signals the end of the stream.
Basic streaming
import os
from openai import OpenAI
# SDK client pointed at the Cloudach endpoint; the key is read from the environment.
client = OpenAI(
    base_url="https://api.cloudach.com/v1",
    api_key=os.environ["CLOUDACH_API_KEY"],
)

# With stream=True the call yields chunks to iterate over instead of one response.
stream = client.chat.completions.create(
    model="llama3-8b",
    messages=[{"role": "user", "content": "Write a poem about the ocean."}],
    stream=True,
)

# Print every non-empty piece of text as soon as it arrives.
for chunk in stream:
    piece = chunk.choices[0].delta.content
    if not piece:
        continue
    print(piece, end="", flush=True)
print()  # newline after stream ends
Collecting the full response
Often you want to both stream tokens to the UI and capture the complete text when done.
# Request a streamed completion using the `client` created earlier in this guide.
stream = client.chat.completions.create(
    stream=True,
    model="llama3-8b",
    messages=[{"role": "user", "content": "Explain recursion briefly."}],
)

# Echo each piece as it arrives while keeping a copy for the final text.
chunks = []
for chunk in stream:
    content = chunk.choices[0].delta.content
    piece = content if content else ""
    chunks.append(piece)
    print(piece, end="", flush=True)

# Stitch the collected pieces back into the complete response.
full_text = "".join(chunks)
print(f"\n\nTotal characters: {len(full_text)}")
Error handling
Streaming errors fall into two categories:
- Pre-stream — request rejected before any data is sent (e.g. 401, 429). You get a normal HTTP error response.
- Mid-stream — backend fails after the stream starts. The SDK surfaces this as an exception during iteration.
import os  # os.environ is read when the client is constructed below
import time

from openai import OpenAI, APIStatusError, APIConnectionError

# SDK client pointed at the Cloudach endpoint; the key is read from the environment.
client = OpenAI(
    base_url="https://api.cloudach.com/v1",
    api_key=os.environ["CLOUDACH_API_KEY"],
)

# HTTP status codes for which a failed request is retried.
RETRYABLE = {429, 500, 502, 503}
def _retry_delay(retry_after, attempt):
    """Return the number of seconds to wait before the next attempt.

    Uses the Retry-After header value when it is a finite, non-negative
    number of seconds. Otherwise (header missing, or not numeric, e.g. an
    HTTP-date) falls back to exponential backoff: 2 ** attempt.
    """
    if retry_after:
        try:
            seconds = float(retry_after)
        except ValueError:
            seconds = -1.0  # not a number of seconds; use the fallback below
        # The chained comparison also rejects NaN and infinity, which
        # time.sleep() cannot handle.
        if 0 <= seconds < float("inf"):
            return seconds
    return 2 ** attempt


def stream_with_retry(messages, model="llama3-8b", max_retries=3):
    """Stream a chat completion to stdout and return the complete text.

    Args:
        messages: Chat messages passed straight through to the API.
        model: Model name to request.
        max_retries: Total number of attempts (not additional retries).

    Returns:
        The concatenated text of all streamed pieces.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        APIStatusError: For a non-retryable status, or when the last
            attempt fails with a retryable one.
        APIConnectionError: When the last attempt fails to connect.

    Note:
        Text printed before a mid-stream failure is not rolled back; a
        retry prints the response again from the start.
    """
    if max_retries < 1:
        # Without this, the loop below would never run and None would be
        # returned silently.
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
            )
            collected = []
            for chunk in stream:
                # Skip chunks that carry no choices (e.g. a usage-only
                # chunk) instead of failing on choices[0].
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta.content or ""
                collected.append(delta)
                print(delta, end="", flush=True)
            print()
            return "".join(collected)
        except APIStatusError as e:
            if e.status_code not in RETRYABLE or last_attempt:
                raise
            wait = _retry_delay(e.response.headers.get("Retry-After"), attempt)
            print(f"\nRetrying in {wait}s (attempt {attempt + 1})...")
            time.sleep(wait)
        except APIConnectionError:
            # TCP reset or proxy timeout — retry immediately
            if last_attempt:
                raise
            print("\nConnection error, retrying...")
result = stream_with_retry([{"role": "user", "content": "Hello!"}])
Async Python (asyncio)
Use AsyncOpenAI for async frameworks like FastAPI, aiohttp, or raw asyncio:
import asyncio
import os
from openai import AsyncOpenAI
# Async SDK client pointed at the Cloudach endpoint; the key is read from the environment.
client = AsyncOpenAI(
    api_key=os.environ["CLOUDACH_API_KEY"],
    base_url="https://api.cloudach.com/v1",
)
async def stream_response(prompt: str) -> str:
    """Stream a completion for ``prompt`` to stdout and return the whole text.

    Each piece is printed as it arrives; a chunk whose delta has no content
    contributes an empty string.
    """
    request = [{"role": "user", "content": prompt}]
    stream = await client.chat.completions.create(
        model="llama3-8b",
        messages=request,
        stream=True,
    )

    pieces = []
    async for chunk in stream:
        content = chunk.choices[0].delta.content
        piece = content if content else ""
        pieces.append(piece)
        print(piece, end="", flush=True)

    print()  # finish the line once the stream is exhausted
    return "".join(pieces)
# FastAPI example
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
app = FastAPI()
@app.get("/chat")
async def chat(prompt: str):
async def generate():
stream = await client.chat.completions.create(
model="llama3-8b",
messages=[{"role": "user", "content": prompt}],
stream=True,
)
async for chunk in stream:
delta = chunk.choices[0].delta.content or ""
if delta:
yield delta
return StreamingResponse(generate(), media_type="text/plain")Node.js — streaming to an HTTP response
To stream tokens directly to an HTTP client (e.g. in Next.js API routes or Express):
// Next.js App Router — route.ts
import OpenAI from "openai";
import { NextRequest } from "next/server";
// SDK client pointed at the Cloudach endpoint; the key is read from the environment.
const client = new OpenAI({
  apiKey: process.env.CLOUDACH_API_KEY!,
  baseURL: "https://api.cloudach.com/v1",
});
/**
 * POST handler: streams the model's reply to `prompt` back as plain text.
 *
 * Responds with 400 when the JSON body has no non-empty string `prompt`.
 */
export async function POST(req: NextRequest) {
  const { prompt } = await req.json();
  if (typeof prompt !== "string" || prompt.length === 0) {
    return new Response("Missing prompt", { status: 400 });
  }

  const stream = await client.chat.completions.create({
    model: "llama3-8b",
    messages: [{ role: "user", content: prompt }],
    stream: true,
  });

  // Forward only the text deltas (not the raw SSE events) to the browser.
  const encoder = new TextEncoder();
  let cancelled = false;
  const readable = new ReadableStream({
    async start(controller) {
      for await (const chunk of stream) {
        if (cancelled) return;
        const delta = chunk.choices[0]?.delta?.content ?? "";
        if (delta) controller.enqueue(encoder.encode(delta));
      }
      // Closing an already-cancelled stream would throw.
      if (!cancelled) controller.close();
    },
    cancel() {
      // The reader went away: abort the upstream request so it stops streaming.
      cancelled = true;
      stream.controller.abort();
    },
  });

  return new Response(readable, {
    headers: { "Content-Type": "text/plain; charset=utf-8" },
  });
}
Raw cURL / SSE
You can consume the raw SSE stream with curl:
curl https://api.cloudach.com/v1/chat/completions \
-H "Authorization: Bearer $CLOUDACH_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "llama3-8b",
"messages": [{"role": "user", "content": "Count from 1 to 5 slowly."}],
"stream": true
}'Output (raw SSE):
data: {"id":"chatcmpl-abc","choices":[{"delta":{"role":"assistant","content":""},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":"1"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":", 2"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":", 3, 4, 5."},"index":0}]}
data: [DONE]
UI patterns
Common patterns for chat UIs:
Append tokens to state (React)
// React hook for streaming chat
import { useState } from "react";
/**
 * React hook that sends a prompt to /api/chat and exposes the reply as it streams in.
 *
 * Returns { response, loading, send }:
 *  - response: the text received so far
 *  - loading:  true while a request is in flight
 *  - send(prompt): starts a request; rejects if the request fails or the
 *    server answers with a non-2xx status or an empty body
 */
export function useCloudachStream() {
  const [response, setResponse] = useState("");
  const [loading, setLoading] = useState(false);

  async function send(prompt) {
    setLoading(true);
    setResponse("");
    try {
      // Stream via your own API route to keep the key server-side
      const res = await fetch("/api/chat", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ prompt }),
      });
      if (!res.ok || !res.body) {
        throw new Error(`Chat request failed with status ${res.status}`);
      }

      const reader = res.body.getReader();
      const decoder = new TextDecoder();
      while (true) {
        const { value, done } = await reader.read();
        if (done) break;
        // stream: true keeps a multi-byte character that is split across two
        // network chunks buffered until its remaining bytes arrive. Decode
        // outside the state updater because the decoder is stateful.
        const text = decoder.decode(value, { stream: true });
        if (text) setResponse((prev) => prev + text);
      }
      // Flush any bytes still buffered in the decoder.
      const tail = decoder.decode();
      if (tail) setResponse((prev) => prev + tail);
    } finally {
      // Clear the flag on errors too so the UI never stays stuck in "loading".
      setLoading(false);
    }
  }

  return { response, loading, send };
}