Spaces:

mistralai
/

Ministral_3B_WebGPU

Running

App Files Files Community

Additional screen as input

by oliveregger - opened 3 days ago

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

-1

oliveregger

3 days ago

No description provided.

Update src/components/WelcomeScreen.tsx13f570b3

oliveregger changed pull request title from Update src/components/WelcomeScreen.tsx to Additional screen as input 3 days ago

oliveregger

3 days ago

I tried to push my changes to the PR, but I get the following error:

remote: -------------------------------------------------------------------------
remote: You are not authorized to push to some of the references:
remote: - refs/heads/pr/3: forbidden
remote: -------------------------------------------------------------------------

I have attached the patch below for your consideration, in case you want to include the screen as an input source.

From 17d71b0b5f1bac5d790460460171b7bf27d05a67 Mon Sep 17 00:00:00 2001
From: oliveregger <oliver.egger@ahdis.ch>
Date: Thu, 4 Dec 2025 11:32:37 +0100
Subject: [PATCH] add screen as input in addition to camera

---
 .gitignore                                | 67 +++++++++++++++++
 src/App.tsx                               |  6 +-
 src/components/CaptioningView.tsx         |  4 +-
 src/components/WebcamCapture.tsx          |  4 +-
 src/components/WebcamPermissionDialog.tsx | 91 ++++++++++++++++++-----
 src/components/WelcomeScreen.tsx          |  2 +-
 6 files changed, 152 insertions(+), 22 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e00f300
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,67 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+# Dependencies
+node_modules
+.pnp
+.pnp.js
+
+# Build outputs
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# Environment variables
+.env
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+# Testing
+coverage
+*.lcov
+.nyc_output
+
+# Temporary files
+*.tmp
+*.temp
+.cache
+
+# OS files
+Thumbs.db
+.DS_Store
+
+# TypeScript
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Vite
+.vite
+vite.config.js.timestamp-*
+vite.config.ts.timestamp-*
diff --git a/src/App.tsx b/src/App.tsx
index 430de21..bff17d3 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -9,10 +9,12 @@ export default function App() {
   const [appState, setAppState] = useState<AppState>("requesting-permission");
   const [webcamStream, setWebcamStream] = useState<MediaStream | null>(null);
   const [isVideoReady, setIsVideoReady] = useState(false);
+  const [sourceType, setSourceType] = useState<"camera" | "screen">("camera");
   const videoRef = useRef<HTMLVideoElement | null>(null);
 
-  const handlePermissionGranted = useCallback((stream: MediaStream) => {
+  const handlePermissionGranted = useCallback((stream: MediaStream, source: "camera" | "screen") => {
     setWebcamStream(stream);
+    setSourceType(source);
     setAppState("welcome");
   }, []);
 
@@ -109,7 +111,7 @@ export default function App() {
         <LoadingScreen onComplete={handleLoadingComplete} />
       )}
 
-      {appState === "captioning" && <CaptioningView videoRef={videoRef} />}
+      {appState === "captioning" && <CaptioningView videoRef={videoRef} sourceType={sourceType} />}
     </div>
   );
 }
diff --git a/src/components/CaptioningView.tsx b/src/components/CaptioningView.tsx
index 2c58085..a0c77bb 100644
--- a/src/components/CaptioningView.tsx
+++ b/src/components/CaptioningView.tsx
@@ -7,6 +7,7 @@ import { PROMPTS, TIMING } from "../constants";
 
 interface CaptioningViewProps {
   videoRef: React.RefObject<HTMLVideoElement | null>;
+  sourceType: "camera" | "screen";
 }
 
 function useCaptioningLoop(
@@ -94,7 +95,7 @@ function useCaptioningLoop(
   }, [isRunning, isLoaded, runInference, promptRef, videoRef]);
 }
 
-export default function CaptioningView({ videoRef }: CaptioningViewProps) {
+export default function CaptioningView({ videoRef, sourceType }: CaptioningViewProps) {
   const { imageSize, setImageSize } = useVLMContext();
   const [caption, setCaption] = useState<string>("");
   const [isLoopRunning, setIsLoopRunning] = useState<boolean>(true);
@@ -177,6 +178,7 @@ export default function CaptioningView({ videoRef }: CaptioningViewProps) {
           error={error}
           imageSize={imageSize}
           onImageSizeChange={setImageSize}
+          sourceType={sourceType}
         />
 
         {/* Prompt Input - Bottom Left */}
diff --git a/src/components/WebcamCapture.tsx b/src/components/WebcamCapture.tsx
index 2f70b43..a982dd0 100644
--- a/src/components/WebcamCapture.tsx
+++ b/src/components/WebcamCapture.tsx
@@ -7,6 +7,7 @@ interface WebcamCaptureProps {
   error?: string | null;
   imageSize?: number;
   onImageSizeChange?: (size: number) => void;
+  sourceType: "camera" | "screen";
 }
 
 export default function WebcamCapture({
@@ -15,6 +16,7 @@ export default function WebcamCapture({
   error,
   imageSize,
   onImageSizeChange,
+  sourceType,
 }: WebcamCaptureProps) {
   const hasError = Boolean(error);
 
@@ -26,7 +28,7 @@ export default function WebcamCapture({
       }
     : isRunning
       ? {
-          text: "LIVE FEED",
+          text: sourceType === "screen" ? "SCREEN CAPTURE" : "LIVE FEED",
           color: "bg-[var(--mistral-orange)] animate-pulse",
           border: "border-[var(--mistral-orange)]",
         }
diff --git a/src/components/WebcamPermissionDialog.tsx b/src/components/WebcamPermissionDialog.tsx
index dae8bc0..0aa5d03 100644
--- a/src/components/WebcamPermissionDialog.tsx
+++ b/src/components/WebcamPermissionDialog.tsx
@@ -17,13 +17,15 @@ const VIDEO_CONSTRAINTS = {
   },
 };
 
+type SourceType = "camera" | "screen";
+
 interface ErrorInfo {
   type: (typeof ERROR_TYPES)[keyof typeof ERROR_TYPES];
   message: string;
 }
 
 interface WebcamPermissionDialogProps {
-  onPermissionGranted: (stream: MediaStream) => void;
+  onPermissionGranted: (stream: MediaStream, sourceType: SourceType) => void;
 }
 
 export default function WebcamPermissionDialog({
@@ -31,6 +33,7 @@ export default function WebcamPermissionDialog({
 }: WebcamPermissionDialogProps) {
   const [isRequesting, setIsRequesting] = useState(false);
   const [error, setError] = useState<ErrorInfo | null>(null);
+  const [selectedSource, setSelectedSource] = useState<SourceType | null>(null);
 
   const [mounted, setMounted] = useState(false);
   useEffect(() => setMounted(true), []);
@@ -91,31 +94,40 @@ export default function WebcamPermissionDialog({
     };
   };
 
-  const requestWebcamAccess = useCallback(async () => {
+  const requestAccess = useCallback(async (sourceType: SourceType) => {
     setIsRequesting(true);
     setError(null);
+    setSelectedSource(sourceType);
 
     try {
-      if (!navigator.mediaDevices?.getUserMedia) {
-        throw new Error("NOT_SUPPORTED");
+      let stream: MediaStream;
+
+      if (sourceType === "camera") {
+        if (!navigator.mediaDevices?.getUserMedia) {
+          throw new Error("NOT_SUPPORTED");
+        }
+        stream = await navigator.mediaDevices.getUserMedia(VIDEO_CONSTRAINTS);
+      } else {
+        // Screen capture
+        if (!navigator.mediaDevices?.getDisplayMedia) {
+          throw new Error("NOT_SUPPORTED");
+        }
+        stream = await navigator.mediaDevices.getDisplayMedia({
+          video: true,
+          audio: false,
+        } as DisplayMediaStreamOptions);
       }
 
-      const stream =
-        await navigator.mediaDevices.getUserMedia(VIDEO_CONSTRAINTS);
-      onPermissionGranted(stream);
+      onPermissionGranted(stream, sourceType);
     } catch (err) {
       const errorInfo = getErrorInfo(err);
       setError(errorInfo);
-      console.error("Error accessing webcam:", err, errorInfo);
+      console.error(`Error accessing ${sourceType}:`, err, errorInfo);
     } finally {
       setIsRequesting(false);
     }
   }, [onPermissionGranted]);
 
-  useEffect(() => {
-    requestWebcamAccess();
-  }, [requestWebcamAccess]);
-
   const troubleshootingData = useMemo(
     () => ({
       [ERROR_TYPES.HTTPS]: {
@@ -209,15 +221,15 @@ export default function WebcamPermissionDialog({
   };
 
   const getTitle = () => {
-    if (isRequesting) return "Initialize Camera";
+    if (isRequesting) return selectedSource === "screen" ? "Initialize Screen Capture" : "Initialize Camera";
     if (error) return "Connection Failed";
-    return "Permission Required";
+    return "Select Video Source";
   };
 
   const getDescription = () => {
-    if (isRequesting) return "Requesting access to video input device...";
+    if (isRequesting) return "Requesting access to video source...";
     if (error) return error.message;
-    return "Ministral WebGPU requires local camera access for real-time inference.";
+    return "Choose your video source for real-time visual inference.";
   };
 
   return (
@@ -324,12 +336,57 @@ export default function WebcamPermissionDialog({
               </p>
             </div>
 
+            {/* Source Selection Buttons */}
+            {!isRequesting && !error && (
+              <div className="flex flex-col gap-3">
+                <Button
+                  onClick={() => requestAccess("camera")}
+                  className="w-full px-6 py-4 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-[var(--mistral-orange-dark)] flex items-center justify-center gap-3"
+                >
+                  <svg
+                    className="w-6 h-6"
+                    fill="none"
+                    viewBox="0 0 24 24"
+                    stroke="currentColor"
+                    strokeWidth={2}
+                  >
+                    <path
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                      d="M15 10l4.553-2.276A1 1 0 0121 8.618v6.764a1 1 0 01-1.447.894L15 14M5 18h8a2 2 0 002-2V8a2 2 0 00-2-2H5a2 2 0 00-2 2v8a2 2 0 002 2z"
+                    />
+                  </svg>
+                  Use Camera
+                </Button>
+
+                <Button
+                  onClick={() => requestAccess("screen")}
+                  className="w-full px-6 py-4 bg-gray-700 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-gray-800 flex items-center justify-center gap-3"
+                >
+                  <svg
+                    className="w-6 h-6"
+                    fill="none"
+                    viewBox="0 0 24 24"
+                    stroke="currentColor"
+                    strokeWidth={2}
+                  >
+                    <path
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                      d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"
+                    />
+                  </svg>
+                  Capture Screen
+                </Button>
+              </div>
+            )}
+
             {/* Error Actions */}
             {error && (
               <div className="animate-enter">
                 <div className="flex justify-center mb-6">
                   <Button
-                    onClick={requestWebcamAccess}
+                    onClick={() => requestAccess(selectedSource || "camera")}
                     disabled={isRequesting}
                     className="px-8 py-3 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-[var(--mistral-orange-dark)]"
                   >
diff --git a/src/components/WelcomeScreen.tsx b/src/components/WelcomeScreen.tsx
index d69ec7e..4c58392 100644
--- a/src/components/WelcomeScreen.tsx
+++ b/src/components/WelcomeScreen.tsx
@@ -131,7 +131,7 @@ export default function WelcomeScreen({ onStart }: WelcomeScreenProps) {
                   Private & Local
                 </h4>
                 <p className="text-gray-600 leading-relaxed">
-                  Your video feed is processed locally and never sent to a
+                  Your video source (camera or screen) is processed locally and never sent to a
                   server, powered by
                   <a href="https://github.com/huggingface/transformers.js">
                     <span className="font-medium underline">
-- 
2.50.1 (Apple Git-155)

Jofthomas

Mistral AI_ org 3 days ago

That's a great idea, and we will indeed try to make the demo evolve in that direction of supporting multiple sources.

oliveregger

2 days ago

I was rate-limited ... see https://huggingface.co/spaces/oliveregger/Ministral_3B_WebGPU where I applied the above patch. Great work you have done with that example, thanks a lot!

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

Ready to merge

This branch is ready to get merged automatically.

· Sign up or log in to comment