TypeScript · Intermediate

Ollama Local LLM Inference

Run local LLM inference through the Ollama REST API, with streaming responses and model management.

TypeScript — press ⌘/Ctrl + Shift + C to copy

/**
 * Sends a single, non-streaming completion request to a local Ollama server.
 *
 * Posts to `http://localhost:11434/api/generate` with `stream: false` and
 * returns the `response` field of the JSON reply.
 *
 * @param prompt - Prompt text forwarded to the model.
 * @param model - Name of the model to run; defaults to `'llama3.1'`.
 * @returns The generated text.
 * @throws Error if the server answers with a non-2xx status, or if the reply
 *   does not carry a string `response` field.
 */
async function chatWithOllama(
  prompt: string,
  model = 'llama3.1'
): Promise<string> {
  const response = await fetch('http://localhost:11434/api/generate', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, prompt, stream: false }),
  });

  if (!response.ok) {
    // Surface the server's own error text (if any) instead of silently
    // returning `undefined` from a function typed as Promise<string>.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Ollama /api/generate request failed with HTTP ${response.status}` +
        (detail ? `: ${detail}` : '')
    );
  }

  const data = (await response.json()) as { response?: unknown } | null;
  const text = data?.response;
  if (typeof text !== 'string') {
    throw new Error('Ollama reply did not contain a string "response" field');
  }
  return text;
}

/**
 * Streams a completion from a local Ollama server, yielding text fragments as
 * they arrive.
 *
 * Posts to `http://localhost:11434/api/generate` with `stream: true`. The
 * reply body is read as newline-delimited JSON; for every line whose
 * `response` field is a non-empty string, that string is yielded.
 *
 * @param prompt - Prompt text forwarded to the model.
 * @param model - Name of the model to run; defaults to `'llama3.1'`.
 * @returns An async generator of text fragments in arrival order.
 * @throws Error if the server answers with a non-2xx status or the reply has
 *   no body; `SyntaxError` if a complete line is not valid JSON.
 */
async function* streamOllama(
  prompt: string,
  model = 'llama3.1'
): AsyncGenerator<string> {
  const response = await fetch('http://localhost:11434/api/generate', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, prompt, stream: true }),
  });

  if (!response.ok) {
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Ollama /api/generate request failed with HTTP ${response.status}` +
        (detail ? `: ${detail}` : '')
    );
  }
  if (!response.body) {
    throw new Error('Ollama streaming response has no body');
  }

  // Extracts the text fragment from one NDJSON line; returns '' when the line
  // is blank or carries no text.
  const textFromLine = (line: string): string => {
    if (!line.trim()) return '';
    const parsed = JSON.parse(line) as { response?: unknown };
    return typeof parsed.response === 'string' ? parsed.response : '';
  };

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  // Holds the trailing partial line between reads: network chunks are not
  // aligned to line boundaries, so a JSON object may span several reads.
  let pending = '';

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // `stream: true` keeps an incomplete multi-byte UTF-8 sequence buffered
      // inside the decoder instead of emitting a replacement character.
      pending += decoder.decode(value, { stream: true });

      const lines = pending.split('\n');
      // The last element is either '' (chunk ended on a newline) or an
      // incomplete line that must wait for more data.
      pending = lines.pop() ?? '';

      for (const line of lines) {
        const text = textFromLine(line);
        if (text) yield text;
      }
    }

    // Flush whatever the decoder still holds and handle a final line that was
    // not terminated by a newline.
    pending += decoder.decode();
    const tail = textFromLine(pending);
    if (tail) yield tail;
  } finally {
    // Runs on normal completion, on error, and when the consumer stops
    // iterating early; cancelling lets the underlying request be torn down.
    await reader.cancel().catch(() => undefined);
  }
}

/**
 * Sends a multi-turn conversation to a local Ollama server and returns the
 * reply in one piece.
 *
 * Posts to `http://localhost:11434/api/chat` with `stream: false`. The parsed
 * JSON body is returned as-is; its shape is not validated here.
 *
 * @param messages - Conversation so far, each entry carrying a `role` and its
 *   `content`.
 * @param model - Name of the model to run; defaults to `'llama3.1'`.
 * @returns The parsed JSON reply from the server.
 * @throws Error if the server answers with a non-2xx status.
 */
async function chatConversation(
  messages: { role: string; content: string }[],
  model = 'llama3.1'
) {
  const response = await fetch('http://localhost:11434/api/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ model, messages, stream: false }),
  });

  if (!response.ok) {
    // Without this check an error payload would be handed back to the caller
    // as though it were a successful chat reply.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Ollama /api/chat request failed with HTTP ${response.status}` +
        (detail ? `: ${detail}` : '')
    );
  }

  return response.json();
}

// Example usage: send one prompt to the default model ('llama3.1') and wait
// for the complete reply. Uses top-level await, so this must run as an ES module.
const result = await chatWithOllama('Explain closures in JavaScript');