Kyryll Kochkin committed on
Commit
f65a41f
·
1 Parent(s): f64f759

Add endpoint health monitoring to root status

Browse files
Files changed (1) hide show
  1. app/main.py +100 -3
app/main.py CHANGED
@@ -1,13 +1,18 @@
1
  """FastAPI application entrypoint."""
2
  from __future__ import annotations
3
 
 
 
4
  import logging
 
5
  from logging.config import dictConfig
6
- from typing import Any, Dict
7
 
 
8
  from fastapi import FastAPI, HTTPException, Request
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.responses import JSONResponse
 
11
 
12
  from .core.settings import get_settings
13
  from .routers import chat, completions, embeddings, models
@@ -38,6 +43,75 @@ logger = logging.getLogger(__name__)
38
 
39
  app = FastAPI(title="GPT3dev OpenAI-Compatible API", version="1.0.0")
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  if settings.cors_allow_origins:
42
  app.add_middleware(
43
  CORSMiddleware,
@@ -59,9 +133,21 @@ async def healthcheck() -> Dict[str, str]:
59
 
60
 
61
  @app.get("/")
62
- async def root() -> Dict[str, str]:
63
  """Root endpoint used by platform health checks (e.g., HF Spaces)."""
64
- return {"status": "ok", "message": "GPT3dev API is running"}
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  @app.on_event("startup")
@@ -74,6 +160,17 @@ async def on_startup() -> None:
74
  except Exception: # pragma: no cover - defensive logging only
75
  models = "(unavailable)"
76
  logger.info("API startup complete. Log level=%s. Models=[%s]", settings.log_level, models)
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  @app.exception_handler(HTTPException)
 
1
  """FastAPI application entrypoint."""
2
  from __future__ import annotations
3
 
4
+ import asyncio
5
+ import contextlib
6
  import logging
7
+ from datetime import datetime, timezone
8
  from logging.config import dictConfig
9
+ from typing import Any, Dict, List, Optional
10
 
11
+ import httpx
12
  from fastapi import FastAPI, HTTPException, Request
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from fastapi.responses import JSONResponse
15
+ from fastapi.routing import APIRoute
16
 
17
  from .core.settings import get_settings
18
  from .routers import chat, completions, embeddings, models
 
43
 
44
  app = FastAPI(title="GPT3dev OpenAI-Compatible API", version="1.0.0")
45
 
46
# Seconds the monitor sleeps between polling passes.
CHECK_INTERVAL_SECONDS = 60
# Route paths the monitor never probes.
IGNORED_MONITOR_PATHS = {"/"}

# Maps an endpoint path to the details recorded for its failure.
EndpointStatus = Dict[str, Dict[str, Any]]

# Latest monitoring result: "failures" holds the failing endpoints and
# "last_checked" the ISO-format UTC time of the most recent pass (None until
# the first pass completes).
_endpoint_status: Dict[str, Any] = {"failures": {}, "last_checked": None}
# Handle of the background polling task; None when no task is being tracked.
_endpoint_monitor_task: Optional[asyncio.Task[None]] = None
53
+
54
+
55
def _monitored_endpoints() -> List[str]:
    """Return the sorted, de-duplicated paths the health monitor should probe.

    A route qualifies only if it is an ``APIRoute`` that accepts GET, is not
    listed in ``IGNORED_MONITOR_PATHS``, declares no path parameters, and has
    ``include_in_schema`` set.
    """

    def _is_probeable(route: Any) -> bool:
        # The chain short-circuits left to right, so APIRoute-only attributes
        # are never touched on other route types.
        return bool(
            isinstance(route, APIRoute)
            and "GET" in (route.methods or set())
            and route.path not in IGNORED_MONITOR_PATHS
            and not route.dependant.path_params
            and route.include_in_schema
        )

    unique_paths = {route.path for route in app.routes if _is_probeable(route)}
    return sorted(unique_paths)
70
+
71
+
72
async def _poll_endpoint_health() -> None:
    """Probe every monitored endpoint in a loop and record the failures.

    Runs until cancelled. Each pass issues an in-process GET to every path
    returned by ``_monitored_endpoints()``, stores the failures together with
    an ISO-format UTC timestamp in ``_endpoint_status``, logs when the set of
    failing endpoints changes, then sleeps ``CHECK_INTERVAL_SECONDS``.

    Raises:
        asyncio.CancelledError: Propagated when the task is cancelled.
    """
    previous_failures: set[str] = set()
    # Route requests straight into the ASGI app through an explicit transport.
    # The ``app=`` shortcut on ``AsyncClient`` was deprecated in httpx 0.27 and
    # removed in 0.28, where it raises before the first pass ever runs.
    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(
        transport=transport, base_url="http://status-check", timeout=10.0
    ) as client:
        while True:
            try:
                failures: Dict[str, Dict[str, Any]] = {}
                for path in _monitored_endpoints():
                    try:
                        response = await client.get(path)
                    except Exception as exc:  # httpx errors and app errors alike
                        # Some exceptions stringify to "", which would leave
                        # the report empty; fall back to the class name.
                        failures[path] = {"error": str(exc) or type(exc).__name__}
                        continue
                    if not 200 <= response.status_code < 400:
                        failures[path] = {
                            "status_code": response.status_code,
                            # Cap the body so the status payload stays small.
                            "detail": response.text[:200],
                        }
                _endpoint_status["failures"] = failures
                _endpoint_status["last_checked"] = datetime.now(timezone.utc).isoformat()
                current_failures = set(failures)
                # Log only on transitions to avoid repeating the same message
                # on every pass.
                if current_failures != previous_failures:
                    if current_failures:
                        logger.warning(
                            "Endpoint monitor detected failures: %s",
                            sorted(current_failures),
                        )
                    else:
                        logger.info("All monitored endpoints restored")
                previous_failures = current_failures
                await asyncio.sleep(CHECK_INTERVAL_SECONDS)
            except asyncio.CancelledError:  # pragma: no cover - shutdown handling
                raise
            except Exception:  # pragma: no cover - defensive logging only
                logger.exception("Unexpected error during endpoint monitoring")
                # Still wait a full interval so a persistent fault cannot
                # turn this loop into a busy spin.
                await asyncio.sleep(CHECK_INTERVAL_SECONDS)
108
+
109
+
110
def _ensure_monitor_task() -> None:
    """Start the endpoint monitor unless a live task already exists."""
    global _endpoint_monitor_task
    current = _endpoint_monitor_task
    if current is not None and not current.done():
        return  # still running; nothing to do
    _endpoint_monitor_task = asyncio.create_task(_poll_endpoint_health())
114
+
115
  if settings.cors_allow_origins:
116
  app.add_middleware(
117
  CORSMiddleware,
 
133
 
134
 
135
  @app.get("/")
136
async def root() -> Dict[str, Any]:
    """Report service status for platform health checks (e.g., HF Spaces).

    Returns the plain "ok" payload when the monitor has recorded no failures;
    otherwise a "degraded" payload listing each failing endpoint and, when
    one is available, the timestamp of the last monitoring pass.
    """
    payload: Dict[str, Any] = {"status": "ok", "message": "GPT3dev API is running"}
    failures: EndpointStatus = _endpoint_status.get("failures", {})
    if failures:
        payload["status"] = "degraded"
        # One entry per failing path, ordered by path for stable output.
        payload["issues"] = [
            {"endpoint": path, **failures[path]} for path in sorted(failures)
        ]
        stamp = _endpoint_status.get("last_checked")
        if stamp:
            payload["last_checked"] = stamp
    return payload
151
 
152
 
153
  @app.on_event("startup")
 
160
  except Exception: # pragma: no cover - defensive logging only
161
  models = "(unavailable)"
162
  logger.info("API startup complete. Log level=%s. Models=[%s]", settings.log_level, models)
163
+ _ensure_monitor_task()
164
+
165
+
166
@app.on_event("shutdown")
async def on_shutdown() -> None:
    """Cancel the endpoint monitor task and wait for it to finish.

    The global handle is cleared before awaiting, so it is reset however the
    task ended. A task that had already stopped with an error is logged
    rather than re-raised, so shutdown is not interrupted by a stale failure.
    """
    global _endpoint_monitor_task
    task = _endpoint_monitor_task
    _endpoint_monitor_task = None
    if task is None:
        return
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        # Expected outcome of the cancel() above.
        pass
    except Exception:
        # cancel() is a no-op on a task that already finished; awaiting it
        # then surfaces whatever error ended the monitor earlier.
        logger.exception("Endpoint monitor task ended with an error")
174
 
175
 
176
  @app.exception_handler(HTTPException)