Spaces:

cfahlgren1
/

inference-proxy

Sleeping

App Files Files Community

cfahlgren1 HF Staff commited on Apr 7

Commit

fa01d80

0 Parent(s):

add basic inference proxy

Browse files

Files changed (7) hide show

.gitignore +175 -0
README.md +48 -0
bun.lockb +0 -0
example.ts +28 -0
index.ts +501 -0
package.json +18 -0
tsconfig.json +27 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,175 @@

+# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+# Caches
+.cache
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+# nyc test coverage
+.nyc_output
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+# Bower dependency directory (https://bower.io/)
+bower_components
+# node-waf configuration
+.lock-wscript
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+# Dependency directories
+node_modules/
+jspm_packages/
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+# TypeScript cache
+*.tsbuildinfo
+# Optional npm cache directory
+.npm
+# Optional eslint cache
+.eslintcache
+# Optional stylelint cache
+.stylelintcache
+# Microbundle cache
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+# Optional REPL history
+.node_repl_history
+# Output of 'npm pack'
+*.tgz
+# Yarn Integrity file
+.yarn-integrity
+# dotenv environment variable files
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+# parcel-bundler cache (https://parceljs.org/)
+.parcel-cache
+# Next.js build output
+.next
+out
+# Nuxt.js build / generate output
+.nuxt
+dist
+# Gatsby files
+# Comment in the public line in if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+# vuepress build output
+.vuepress/dist
+# vuepress v2.x temp and cache directory
+.temp
+# Docusaurus cache and generated files
+.docusaurus
+# Serverless directories
+.serverless/
+# FuseBox cache
+.fusebox/
+# DynamoDB Local files
+.dynamodb/
+# TernJS port file
+.tern-port
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+# yarn v2
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+# IntelliJ based IDEs
+.idea
+# Finder (MacOS) folder config
+.DS_Store

README.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# inference-proxy
+Lightweight proxy to store LLM traces in a Hugging Face Dataset.
+### How it works
+This API acts as a proxy for OpenAI-compatible endpoints. You can specify a couple of optional variables:
+- `BATCH_SIZE_LIMIT` - the maximum number of traces to collect before pushing to the dataset (default: 100)
+- `BATCH_TIME_LIMIT` - the number of minutes to wait before pushing to the dataset (default: 1)
+### Required Environment Variables
+- `HF_API_KEY` - Hugging Face access token (this is the variable name the server reads)
+- `USER_NAME` - Used to ensure we only process requests from the user
+### Example
+```js
+import { OpenAI } from "openai";
+const client = new OpenAI({
+	baseURL: "http://localhost:4040/fireworks-ai/inference/v1",
+	apiKey: process.env.HF_API_KEY,
+});
+let out = "";
+const stream = await client.chat.completions.create({
+    model: "accounts/fireworks/models/deepseek-v3",
+    messages: [
+        {
+            role: "user",
+            content: "What is the capital of France?",
+        },
+    ],
+    stream: true,
+    max_tokens: 500,
+});
+for await (const chunk of stream) {
+	if (chunk.choices && chunk.choices.length > 0) {
+		const newContent = chunk.choices[0].delta.content;
+		out += newContent;
+		console.log(newContent);
+	}
+}
+```

bun.lockb ADDED Viewed

Binary file (19.4 kB). View file

example.ts ADDED Viewed

	@@ -0,0 +1,28 @@

+import { OpenAI } from "openai";
+// Point the OpenAI client at the local proxy instead of the provider itself.
+const client = new OpenAI({
+  baseURL: "http://localhost:4040/fireworks-ai/inference/v1",
+  apiKey: process.env.HF_API_KEY,
+});
+let out = "";
+const stream = await client.chat.completions.create({
+  model: "accounts/fireworks/models/deepseek-v3",
+  messages: [
+    {
+      role: "user",
+      content: "What is the capital of France?",
+    },
+  ],
+  stream: true,
+  max_tokens: 500,
+});
+for await (const chunk of stream) {
+  // `delta.content` can be missing or null on a chunk; skip those so the
+  // literal text "undefined"/"null" is never appended to the output.
+  const newContent = chunk.choices?.[0]?.delta?.content;
+  if (newContent) {
+    out += newContent;
+    console.log(newContent);
+  }
+}

index.ts ADDED Viewed

	@@ -0,0 +1,501 @@

+import { Hono } from 'hono';
+import { stream } from 'hono/streaming';
+import type { StatusCode } from 'hono/utils/http-status';
+import { existsSync, mkdirSync, writeFileSync, readFileSync, unlinkSync } from 'fs';
+import { join, basename, resolve } from 'path';
+import { uploadFiles, checkRepoAccess, createRepo,whoAmI } from '@huggingface/hub';
+import type { RepoDesignation } from '@huggingface/hub';
+const app = new Hono();
+/**
+ * Reads a positive integer from an environment value.
+ * @param value raw environment value (may be undefined or empty)
+ * @param fallback number returned when `value` is missing, not numeric, or not greater than zero
+ * @returns the parsed integer, or `fallback`
+ */
+function parsePositiveInt(value: string | undefined, fallback: number): number {
+  const parsed = Number.parseInt(value ?? '', 10);
+  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
+}
+// Base URL that incoming requests are forwarded to.
+const TARGET_BASE_URL = process.env.TARGET_BASE_URL || "https://router.huggingface.co";
+const PORT = parseInt(process.env.PORT || '4040', 10);
+// Directory where trace files are written before they are uploaded.
+const LOGS_DIR = process.env.LOGS_DIR || './logs';
+// Accept HF_ACCESS_TOKEN (matching this constant's name and the log messages
+// below) as well as the original HF_API_KEY variable.
+const HF_ACCESS_TOKEN = process.env.HF_ACCESS_TOKEN || process.env.HF_API_KEY || '';
+// The dataset is private unless DATASET_PRIVATE is explicitly "false".
+// The previous comparison was inverted: leaving the variable unset produced a
+// private dataset, but setting DATASET_PRIVATE=true produced a public one.
+const DATASET_PRIVATE = (process.env.DATASET_PRIVATE || 'true').toLowerCase() !== 'false';
+/*
+    USER_NAME - the name of the user to use for the dataset
+    This will be used to invalidate requests that are not from the user
+*/
+const USER_NAME = process.env.USER_NAME || 'cfahlgren1';
+if (!HF_ACCESS_TOKEN) {
+  console.error('Please set HF_ACCESS_TOKEN (or HF_API_KEY) in environment variable');
+  process.exit(1);
+}
+// NOTE(review): USER_NAME falls back to a hard-coded account name above, so
+// this guard cannot currently trigger; confirm whether the fallback is wanted.
+if (!USER_NAME) {
+  console.error('Please set USER_NAME in environment variable');
+  process.exit(1);
+}
+/*
+    BATCH_SIZE_LIMIT - the maximum batch size before pushing to dataset
+    BATCH_TIME_LIMIT - the amount of time (in minutes) before pushing to dataset
+    We will push to dataset for whatever is reached first.
+    Invalid values (non-numeric, zero, negative) fall back to the defaults;
+    a NaN limit would otherwise disable size-based flushing and hand the
+    flush timer a NaN delay.
+*/
+const BATCH_SIZE_LIMIT = parsePositiveInt(process.env.BATCH_SIZE_LIMIT, 100);
+const BATCH_TIME_LIMIT = parsePositiveInt(process.env.BATCH_TIME_LIMIT, 1); // 1 minute default
+// Make sure the local trace directory exists before anything is written to it.
+if (!existsSync(LOGS_DIR)) {
+  mkdirSync(LOGS_DIR, { recursive: true });
+}
+/**
+ * Looks up the account that owns HF_ACCESS_TOKEN via whoAmI and reports
+ * whether its name equals `username`.
+ * @param username account name to compare against
+ * @returns true when the token owner's name matches `username`
+ */
+async function checkUserAccess(username: string): Promise<boolean> {
+  const { name: tokenOwner } = await whoAmI({ accessToken: HF_ACCESS_TOKEN });
+  return tokenOwner === username;
+}
+// In-memory queue of request traces waiting to be written out by writeBatchedTraces.
+const requestTraces: {
+  model?: string;
+  timestamp_start: string;
+  timestamp_end?: string;
+  messages?: any[];
+  prompt_tokens?: number;
+  completion_tokens?: number;
+  response?: string;
+  arguments?: any;
+  provider?: string;
+  duration_ms?: number;
+}[] = [];
+// Time (ms since epoch) of the last flush; starts at module load and is reset by writeBatchedTraces.
+let lastTraceBatchTime = Date.now();
+/**
+ * Reports whether the dataset repo `datasetName` can be accessed with HF_ACCESS_TOKEN.
+ * @param datasetName repo name passed to checkRepoAccess
+ * @returns true when checkRepoAccess resolves; false when the token is missing
+ *          or checkRepoAccess fails for any reason
+ */
+async function checkDatasetExists(datasetName: string): Promise<boolean> {
+  if (!HF_ACCESS_TOKEN) {
+    console.warn('HF_ACCESS_TOKEN not set, skipping dataset check');
+    return false;
+  }
+  const repo: RepoDesignation = { type: 'dataset', name: datasetName };
+  try {
+    await checkRepoAccess({ repo, accessToken: HF_ACCESS_TOKEN });
+  } catch {
+    // Every failure is reported as "not accessible".
+    return false;
+  }
+  return true;
+}
+/**
+ * Creates the dataset repo `datasetName` with a minimal README.md.
+ * Visibility comes from DATASET_PRIVATE.
+ * @param datasetName repo name passed to createRepo
+ * @returns true when createRepo resolves; false when the token is missing or
+ *          createRepo fails (the error is logged)
+ */
+async function createDataset(datasetName: string): Promise<boolean> {
+  if (!HF_ACCESS_TOKEN) {
+    console.warn('HF_ACCESS_TOKEN not set, skipping dataset creation');
+    return false;
+  }
+  // README front matter that tags the repo as "inference-proxy".
+  const datasetCard = ['---', 'tags:', '- inference-proxy', '---'].join('\n');
+  const repo: RepoDesignation = { type: 'dataset', name: datasetName };
+  try {
+    await createRepo({
+      repo,
+      accessToken: HF_ACCESS_TOKEN,
+      private: DATASET_PRIVATE,
+      files: [{ path: 'README.md', content: new Blob([datasetCard]) }],
+    });
+  } catch (error) {
+    console.error('Error creating dataset:', error);
+    return false;
+  }
+  return true;
+}
+/**
+ * Serialises one trace as pretty-printed JSON into LOGS_DIR.
+ * The file name is `<start time in ms>_<sanitised model>_<random suffix>.json`.
+ * @param trace trace entry to persist
+ * @returns the path of the written file, or '' when writing failed (the error is logged)
+ */
+function writeTraceToFile(trace: typeof requestTraces[0]): string {
+  try {
+    const parsedStart = new Date(trace.timestamp_start).getTime();
+    // Fall back to "now" so an unparsable start time never yields a "NaN_..." file name.
+    const timestamp = Number.isNaN(parsedStart) ? Date.now() : parsedStart;
+    // The model name originates from the client's request body: coerce it to a
+    // string and keep only characters that are safe inside a file name.
+    const model = String(trace.model || 'unknown').replace(/[^A-Za-z0-9._-]/g, '_');
+    // Random suffix: two traces for the same model that start in the same
+    // millisecond would otherwise overwrite each other.
+    const uniqueSuffix = crypto.randomUUID().slice(0, 8);
+    const filename = `${timestamp}_${model}_${uniqueSuffix}.json`;
+    const filePath = join(LOGS_DIR, filename);
+    writeFileSync(filePath, JSON.stringify(trace, null, 2));
+    return filePath;
+  } catch (error) {
+    console.error('Error writing trace to file:', error);
+    return '';
+  }
+}
+/**
+ * Uploads one local trace file to `traces/<file name>` in the dataset repo.
+ * @param filePath path of the local file to read and upload
+ * @param datasetName repo name passed to uploadFiles
+ * @returns true when the upload resolved; false when the token is missing or
+ *          reading/uploading threw (the error is logged)
+ */
+async function uploadTraceFile(filePath: string, datasetName: string): Promise<boolean> {
+  try {
+    if (!HF_ACCESS_TOKEN) {
+      console.warn('HF_ACCESS_TOKEN not set, skipping file upload');
+      return false;
+    }
+    const repo: RepoDesignation = { type: 'dataset', name: datasetName };
+    const traceFile = {
+      path: `traces/${basename(filePath)}`,
+      content: new Blob([readFileSync(filePath)]),
+    };
+    await uploadFiles({ repo, accessToken: HF_ACCESS_TOKEN, files: [traceFile] });
+    return true;
+  } catch (error) {
+    console.error('Error uploading trace file:', error);
+    return false;
+  }
+}
+/**
+ * Drains the in-memory trace queue: writes every queued trace to a local
+ * file, then uploads those files to the `<token owner>/traces` dataset
+ * (creating the dataset first when it is not accessible). Local files are
+ * deleted only after a successful upload.
+ * Rejects when whoAmI fails; files already written stay on disk in that case.
+ */
+async function writeBatchedTraces() {
+  if (requestTraces.length === 0) {
+    return;
+  }
+  // Detach the queued traces synchronously (before the first await) so traces
+  // recorded while the uploads run end up in the next batch.
+  const tracesToWrite = requestTraces.splice(0, requestTraces.length);
+  const batchSize = tracesToWrite.length;
+  lastTraceBatchTime = Date.now();
+  console.log(`Processing batch of ${batchSize} traces...`);
+  // Write traces to local files first; writeTraceToFile returns '' on failure.
+  const filePaths = tracesToWrite
+    .map((trace) => writeTraceToFile(trace))
+    .filter((filePath) => filePath !== '');
+  if (!HF_ACCESS_TOKEN) {
+    console.log(`HF_ACCESS_TOKEN not set, keeping ${filePaths.length} local files`);
+    return;
+  }
+  const response = await whoAmI({ accessToken: HF_ACCESS_TOKEN });
+  const datasetName = `${response.name}/traces`;
+  // Check if dataset exists, create if not
+  const exists = await checkDatasetExists(datasetName);
+  if (!exists) {
+    console.log(`Dataset ${datasetName} does not exist, creating...`);
+    const created = await createDataset(datasetName);
+    if (created) {
+      console.log(`Successfully created dataset ${datasetName}`);
+    } else {
+      // Uploads are still attempted below; each failure is logged per file.
+      console.error(`Failed to create dataset ${datasetName}`);
+    }
+  }
+  let uploadedCount = 0;
+  for (const filePath of filePaths) {
+    const uploaded = await uploadTraceFile(filePath, datasetName);
+    if (!uploaded) {
+      continue;
+    }
+    uploadedCount += 1;
+    // Clean up local file if uploaded successfully
+    if (existsSync(filePath)) {
+      unlinkSync(filePath);
+      console.log(`Deleted local file ${filePath} after successful upload`);
+    }
+  }
+  // Report what actually happened instead of always claiming success.
+  const notUploaded = batchSize - uploadedCount;
+  if (notUploaded === 0) {
+    console.log(`Successfully processed ${batchSize} traces.`);
+  } else {
+    console.warn(`Processed ${batchSize} traces: ${uploadedCount} uploaded, ${notUploaded} not uploaded (files written for them remain in ${LOGS_DIR}).`);
+  }
+}
+/**
+ * Flushes the queued traces once BATCH_TIME_LIMIT minutes have elapsed since
+ * the last flush and at least one trace is waiting.
+ */
+function flushWhenTimeLimitReached() {
+  const limitMs = BATCH_TIME_LIMIT * 60 * 1000;
+  const elapsedMs = Date.now() - lastTraceBatchTime;
+  if (elapsedMs >= limitMs && requestTraces.length > 0) {
+    console.log(`Time limit reached (${BATCH_TIME_LIMIT} minutes). Flushing ${requestTraces.length} traces.`);
+    writeBatchedTraces().catch((flushError) => {
+      console.error('Error flushing traces:', flushError);
+    });
+  }
+}
+// Poll every 10 seconds, or every BATCH_TIME_LIMIT minutes when that is shorter.
+setInterval(flushWhenTimeLimitReached, Math.min(BATCH_TIME_LIMIT * 60 * 1000, 10000));
+/**
+ * Starts a flush when the queue holds at least BATCH_SIZE_LIMIT traces.
+ * The flush runs in the background; a failure is only logged.
+ * @returns true when a flush was started, false otherwise
+ */
+function checkAndFlushTraces() {
+  const limitReached = requestTraces.length >= BATCH_SIZE_LIMIT;
+  if (!limitReached) {
+    return false;
+  }
+  console.log(`Batch size limit reached (${BATCH_SIZE_LIMIT}). Flushing traces.`);
+  writeBatchedTraces().catch((flushError) => {
+    console.error('Error flushing traces:', flushError);
+  });
+  return true;
+}
+// Plain-text landing response for GET /.
+app.get('/', (c) => c.text('Hono forwarding proxy running!'));
+/** Shape of one queued trace entry. */
+type Trace = typeof requestTraces[0];
+/**
+ * Stamps the end time and duration on a trace, queues it for the next flush
+ * and triggers a size-based flush check.
+ * A trace is queued only here, once it is complete. Queuing it before the
+ * response arrived let a flush empty the queue underneath an in-flight
+ * request, so the half-filled trace was written out and later updates hit a
+ * missing or unrelated array slot.
+ */
+function finalizeTrace(trace: Trace) {
+  trace.timestamp_end = new Date().toISOString();
+  const startTime = new Date(trace.timestamp_start).getTime();
+  const endTime = new Date(trace.timestamp_end).getTime();
+  trace.duration_ms = endTime - startTime;
+  requestTraces.push(trace);
+  checkAndFlushTraces();
+}
+/** Copies token counts from an upstream `usage` object onto the trace, when present. */
+function applyUsage(trace: Trace, usage: any) {
+  if (!usage) {
+    return;
+  }
+  if (usage.completion_tokens !== undefined) {
+    trace.completion_tokens = usage.completion_tokens;
+  }
+  if (usage.prompt_tokens !== undefined) {
+    trace.prompt_tokens = usage.prompt_tokens;
+  }
+}
+/** Fills the trace from a server-sent-events body by concatenating `choices[0].delta.content`. */
+function applyEventStreamBody(trace: Trace, bodyText: string) {
+  let accumulatedContent = '';
+  let sawCompletionUsage = false;
+  for (const line of bodyText.split('\n')) {
+    // The space after "data:" is optional in the event-stream format.
+    if (!line.startsWith('data:')) {
+      continue;
+    }
+    const jsonData = line.slice(5).trim();
+    if (!jsonData || jsonData === '[DONE]') {
+      continue;
+    }
+    let parsed: any;
+    try {
+      parsed = JSON.parse(jsonData);
+    } catch {
+      // Not JSON: continue processing other lines.
+      continue;
+    }
+    const deltaContent = parsed?.choices?.[0]?.delta?.content;
+    if (deltaContent) {
+      accumulatedContent += deltaContent;
+    }
+    // Prefer real token counts when a chunk carries a usage object.
+    if (parsed?.usage) {
+      applyUsage(trace, parsed.usage);
+      sawCompletionUsage = sawCompletionUsage || parsed.usage.completion_tokens !== undefined;
+    }
+  }
+  if (accumulatedContent) {
+    trace.response = accumulatedContent;
+    if (!sawCompletionUsage) {
+      // No usage reported: store the character count as a rough stand-in.
+      trace.completion_tokens = accumulatedContent.length;
+    }
+  }
+}
+/** Fills the trace from a non-streaming body (JSON when parsable, raw text otherwise). */
+function applyJsonBody(trace: Trace, bodyText: string) {
+  try {
+    const jsonResponse = JSON.parse(bodyText);
+    // Get response content from standard LLM response formats
+    trace.response = jsonResponse.choices?.[0]?.message?.content ||
+      jsonResponse.generated_text ||
+      bodyText;
+    applyUsage(trace, jsonResponse.usage);
+  } catch {
+    // If not JSON, use bodyText as is (character count as a rough stand-in).
+    trace.response = bodyText;
+    trace.completion_tokens = bodyText.length;
+  }
+}
+/**
+ * Reads the storage branch of a tee'd upstream response to the end, extracts
+ * the response text / token counts into `trace`, and queues the trace.
+ * @param streamToLog storage branch of the upstream body
+ * @param contentType upstream content-type header (null when absent)
+ * @param targetUrl upstream URL, used in log messages only
+ * @param trace trace entry for this request; it is queued when this function finishes
+ */
+async function storeStreamedResponse(streamToLog: ReadableStream<Uint8Array>, contentType: string | null, targetUrl: string, trace: Trace) {
+  const reader = streamToLog.getReader();
+  const chunks: Uint8Array[] = [];
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (value) chunks.push(value);
+    }
+    const bodyText = await new Blob(chunks).text();
+    // Handle event streams (streaming responses) separately from plain bodies.
+    if ((contentType ?? '').toLowerCase().includes('text/event-stream')) {
+      applyEventStreamBody(trace, bodyText);
+    } else {
+      applyJsonBody(trace, bodyText);
+    }
+  } catch (error) {
+    console.error(`Error capturing response body from ${targetUrl}:`, error);
+  } finally {
+    try {
+      reader.releaseLock();
+    } catch {
+      // Ignore release errors
+    }
+    // Queue the trace even when capturing failed, so the request is still recorded.
+    finalizeTrace(trace);
+  }
+}
+/**
+ * Reports whether the bearer token of the incoming request belongs to `username`.
+ * The previous check resolved the owner of the server's own token, which is
+ * the same for every caller and therefore never rejected anyone.
+ * @param authorization value of the request's Authorization header, if any
+ * @param username account name the token owner must match
+ * @returns false when the header is missing/malformed, the token is rejected, or the names differ
+ */
+async function isRequestFromUser(authorization: string | undefined, username: string): Promise<boolean> {
+  const match = /^Bearer\s+(\S+)\s*$/i.exec(authorization ?? '');
+  if (!match) {
+    return false;
+  }
+  try {
+    const identity = await whoAmI({ accessToken: match[1] });
+    return identity.name === username;
+  } catch (error) {
+    console.warn('Could not verify the request token:', error);
+    return false;
+  }
+}
+/**
+ * Returns the text of a chat message's `content`.
+ * Strings are used as is; for arrays the `text` fields of the parts are joined
+ * (assumes OpenAI-style content parts - TODO confirm); anything else yields ''.
+ */
+function messageText(content: unknown): string {
+  if (typeof content === 'string') {
+    return content;
+  }
+  if (Array.isArray(content)) {
+    return content.map((part) => (typeof part?.text === 'string' ? part.text : '')).join('');
+  }
+  return '';
+}
+// Catch-all route: authorise the caller, forward the request to
+// TARGET_BASE_URL, stream the answer back and record a trace of the exchange.
+app.all('*', async (c) => {
+  try {
+    // Only the configured user may use the proxy.
+    if (USER_NAME && !(await isRequestFromUser(c.req.header('authorization'), USER_NAME))) {
+      return c.text('Unauthorized', 401);
+    }
+    const url = new URL(c.req.url);
+    const targetPath = url.pathname;
+    const targetUrl = `${TARGET_BASE_URL}${targetPath}${url.search}`;
+    // Extract provider from the URL path (first path segment).
+    const pathParts = targetPath.split('/');
+    const provider = pathParts[1] || 'unknown';
+    console.log(`Forwarding request for ${url.pathname} to ${targetUrl}`);
+    const headers = new Headers(c.req.header());
+    headers.set('host', new URL(TARGET_BASE_URL).host);
+    // Length / framing headers are dropped so fetch derives them from the body it sends.
+    headers.delete('content-length');
+    headers.delete('transfer-encoding');
+    let requestBody: BodyInit | null = null;
+    let parsedRequestBody: any = null;
+    const incomingContentType = c.req.header('content-type') || '';
+    const methodNeedsBody = !['GET', 'HEAD'].includes(c.req.method);
+    if (methodNeedsBody && c.req.raw.body) {
+      if (incomingContentType.includes('application/json')) {
+        // Forward the exact text that was received; parsing is only for tracing.
+        const rawBodyText = await c.req.text();
+        requestBody = rawBodyText;
+        try {
+          parsedRequestBody = JSON.parse(rawBodyText);
+        } catch (e) {
+          console.warn("Failed to parse incoming JSON body, forwarding raw body:", e);
+        }
+      } else {
+        requestBody = c.req.raw.body;
+      }
+    }
+    const traceEntry: Trace = {
+      timestamp_start: new Date().toISOString(),
+      provider
+    };
+    if (parsedRequestBody) {
+      if (parsedRequestBody.model) {
+        traceEntry.model = parsedRequestBody.model;
+      } else if (targetPath.includes('/models/') || targetPath.includes('/model/')) {
+        // No model in the body: take the path segment after "models" / "model".
+        const modelIndex = pathParts.findIndex(part => part === 'models' || part === 'model');
+        if (modelIndex >= 0 && pathParts.length > modelIndex + 1) {
+          traceEntry.model = pathParts[modelIndex + 1];
+        }
+      }
+      // Guard against a non-array `messages` value, which would throw in the loop.
+      if (Array.isArray(parsedRequestBody.messages)) {
+        traceEntry.messages = parsedRequestBody.messages;
+        let promptText = '';
+        for (const message of parsedRequestBody.messages) {
+          promptText += messageText(message?.content);
+        }
+        // Character count as a rough stand-in; replaced by the upstream
+        // usage figure when the response reports one.
+        traceEntry.prompt_tokens = promptText.length;
+      }
+      if (parsedRequestBody.arguments) {
+        traceEntry.arguments = parsedRequestBody.arguments;
+      } else if (parsedRequestBody.parameters) {
+        traceEntry.arguments = parsedRequestBody.parameters;
+      }
+    }
+    let response: Response;
+    try {
+      response = await fetch(targetUrl, {
+        method: c.req.method,
+        headers: headers,
+        body: requestBody,
+      });
+    } catch (fetchError) {
+      // Record the failed attempt, then let the outer handler answer with 500.
+      finalizeTrace(traceEntry);
+      throw fetchError;
+    }
+    console.log(`Received response status ${response.status} from ${targetUrl}`);
+    c.status(response.status as StatusCode);
+    // content-encoding is not forwarded, so a content-length that describes
+    // the encoded body must not be forwarded either.
+    const upstreamWasEncoded = response.headers.has('content-encoding');
+    response.headers.forEach((value, key) => {
+      const name = key.toLowerCase();
+      if (name === 'content-encoding' || name === 'transfer-encoding') {
+        return;
+      }
+      if (name === 'content-length' && upstreamWasEncoded) {
+        return;
+      }
+      c.header(key, value);
+    });
+    if (!response.headers.has('content-type')) {
+      c.header('content-type', 'application/octet-stream');
+    }
+    if (response.body) {
+      // One branch goes to the client, the other is captured for the trace.
+      const [streamForClient, streamForStorage] = response.body.tee();
+      const contentType = response.headers.get('content-type');
+      storeStreamedResponse(streamForStorage, contentType, targetUrl, traceEntry).catch(err => {
+        console.error("Error in background stream storage:", err);
+      });
+      return stream(c, async (streamInstance) => {
+        await streamInstance.pipe(streamForClient);
+      });
+    }
+    console.log(`Received response with no body from ${targetUrl}.`);
+    finalizeTrace(traceEntry);
+    return c.body(null);
+  } catch (error) {
+    console.error('Error during proxy request:', error);
+    return c.text('Internal Server Error', 500);
+  }
+});
+// Flush whatever is still queued before exiting on SIGINT or SIGTERM.
+// Exit code is the default on success and 1 when the flush failed.
+for (const signal of ['SIGINT', 'SIGTERM'] as const) {
+  process.on(signal, async () => {
+    console.log('Process terminating, flushing remaining traces...');
+    try {
+      await writeBatchedTraces();
+    } catch (err) {
+      console.error('Error flushing traces on shutdown:', err);
+      process.exit(1);
+    }
+    process.exit();
+  });
+}
+// Log the effective configuration once at startup.
+console.log(`Inference Proxy running on port ${PORT}`);
+console.log(`Forwarding to: ${TARGET_BASE_URL}`);
+console.log(`Logs directory: ${resolve(LOGS_DIR)}`);
+console.log(`Batching: max ${BATCH_SIZE_LIMIT} traces or ${BATCH_TIME_LIMIT} minutes`);
+// Default export exposing the configured port and the Hono app's fetch handler.
+// NOTE(review): presumably picked up by Bun's default-export server convention - confirm.
+export default {
+  port: PORT,
+  fetch: app.fetch,
+};

package.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "name": "inference-proxy",
+  "module": "index.ts",
+  "type": "module",
+  "devDependencies": {
+    "@types/bun": "latest"
+  },
+  "peerDependencies": {
+    "typescript": "^5.0.0"
+  },
+  "dependencies": {
+    "@huggingface/hub": "^1.1.2",
+    "@huggingface/inference": "^3.7.0",
+    "hono": "^4.7.5",
+    "openai": "^4.92.0",
+    "uuid": "^11.1.0"
+  }
+}

tsconfig.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "compilerOptions": {
+    // Enable latest features
+    "lib": ["ESNext", "DOM"],
+    "target": "ESNext",
+    "module": "ESNext",
+    "moduleDetection": "force",
+    "jsx": "react-jsx",
+    "allowJs": true,
+    // Bundler mode
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": true,
+    "noEmit": true,
+    // Best practices
+    "strict": true,
+    "skipLibCheck": true,
+    "noFallthroughCasesInSwitch": true,
+    // Some stricter flags (disabled by default)
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noPropertyAccessFromIndexSignature": false
+  }
+}