Additional screen as input

#3
No description provided.
oliveregger changed pull request title from Update src/components/WelcomeScreen.tsx to Additional screen as input

I tried to push my changes to the PR, but I get the following error:

remote: -------------------------------------------------------------------------
remote: You are not authorized to push to some of the references:
remote: - refs/heads/pr/3: forbidden
remote: -------------------------------------------------------------------------

I have attached the patch for your consideration, in case you want to include screen capture as an input source.

From 17d71b0b5f1bac5d790460460171b7bf27d05a67 Mon Sep 17 00:00:00 2001
From: oliveregger <oliver.egger@ahdis.ch>
Date: Thu, 4 Dec 2025 11:32:37 +0100
Subject: [PATCH] add screen as input in addition to camera

---
 .gitignore                                | 67 +++++++++++++++++
 src/App.tsx                               |  6 +-
 src/components/CaptioningView.tsx         |  4 +-
 src/components/WebcamCapture.tsx          |  4 +-
 src/components/WebcamPermissionDialog.tsx | 91 ++++++++++++++++++-----
 src/components/WelcomeScreen.tsx          |  2 +-
 6 files changed, 152 insertions(+), 22 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e00f300
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,67 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+# Dependencies
+node_modules
+.pnp
+.pnp.js
+
+# Build outputs
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# Environment variables
+.env
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+# Testing
+coverage
+*.lcov
+.nyc_output
+
+# Temporary files
+*.tmp
+*.temp
+.cache
+
+# OS files
+Thumbs.db
+.DS_Store
+
+# TypeScript
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Vite
+.vite
+vite.config.js.timestamp-*
+vite.config.ts.timestamp-*
diff --git a/src/App.tsx b/src/App.tsx
index 430de21..bff17d3 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -9,10 +9,12 @@ export default function App() {
   const [appState, setAppState] = useState<AppState>("requesting-permission");
   const [webcamStream, setWebcamStream] = useState<MediaStream | null>(null);
   const [isVideoReady, setIsVideoReady] = useState(false);
+  const [sourceType, setSourceType] = useState<"camera" | "screen">("camera");
   const videoRef = useRef<HTMLVideoElement | null>(null);
 
-  const handlePermissionGranted = useCallback((stream: MediaStream) => {
+  const handlePermissionGranted = useCallback((stream: MediaStream, source: "camera" | "screen") => {
     setWebcamStream(stream);
+    setSourceType(source);
     setAppState("welcome");
   }, []);
 
@@ -109,7 +111,7 @@ export default function App() {
         <LoadingScreen onComplete={handleLoadingComplete} />
       )}
 
-      {appState === "captioning" && <CaptioningView videoRef={videoRef} />}
+      {appState === "captioning" && <CaptioningView videoRef={videoRef} sourceType={sourceType} />}
     </div>
   );
 }
diff --git a/src/components/CaptioningView.tsx b/src/components/CaptioningView.tsx
index 2c58085..a0c77bb 100644
--- a/src/components/CaptioningView.tsx
+++ b/src/components/CaptioningView.tsx
@@ -7,6 +7,7 @@ import { PROMPTS, TIMING } from "../constants";
 
 interface CaptioningViewProps {
   videoRef: React.RefObject<HTMLVideoElement | null>;
+  sourceType: "camera" | "screen";
 }
 
 function useCaptioningLoop(
@@ -94,7 +95,7 @@ function useCaptioningLoop(
   }, [isRunning, isLoaded, runInference, promptRef, videoRef]);
 }
 
-export default function CaptioningView({ videoRef }: CaptioningViewProps) {
+export default function CaptioningView({ videoRef, sourceType }: CaptioningViewProps) {
   const { imageSize, setImageSize } = useVLMContext();
   const [caption, setCaption] = useState<string>("");
   const [isLoopRunning, setIsLoopRunning] = useState<boolean>(true);
@@ -177,6 +178,7 @@ export default function CaptioningView({ videoRef }: CaptioningViewProps) {
           error={error}
           imageSize={imageSize}
           onImageSizeChange={setImageSize}
+          sourceType={sourceType}
         />
 
         {/* Prompt Input - Bottom Left */}
diff --git a/src/components/WebcamCapture.tsx b/src/components/WebcamCapture.tsx
index 2f70b43..a982dd0 100644
--- a/src/components/WebcamCapture.tsx
+++ b/src/components/WebcamCapture.tsx
@@ -7,6 +7,7 @@ interface WebcamCaptureProps {
   error?: string | null;
   imageSize?: number;
   onImageSizeChange?: (size: number) => void;
+  sourceType: "camera" | "screen";
 }
 
 export default function WebcamCapture({
@@ -15,6 +16,7 @@ export default function WebcamCapture({
   error,
   imageSize,
   onImageSizeChange,
+  sourceType,
 }: WebcamCaptureProps) {
   const hasError = Boolean(error);
 
@@ -26,7 +28,7 @@ export default function WebcamCapture({
       }
     : isRunning
       ? {
-          text: "LIVE FEED",
+          text: sourceType === "screen" ? "SCREEN CAPTURE" : "LIVE FEED",
           color: "bg-[var(--mistral-orange)] animate-pulse",
           border: "border-[var(--mistral-orange)]",
         }
diff --git a/src/components/WebcamPermissionDialog.tsx b/src/components/WebcamPermissionDialog.tsx
index dae8bc0..0aa5d03 100644
--- a/src/components/WebcamPermissionDialog.tsx
+++ b/src/components/WebcamPermissionDialog.tsx
@@ -17,13 +17,15 @@ const VIDEO_CONSTRAINTS = {
   },
 };
 
+type SourceType = "camera" | "screen";
+
 interface ErrorInfo {
   type: (typeof ERROR_TYPES)[keyof typeof ERROR_TYPES];
   message: string;
 }
 
 interface WebcamPermissionDialogProps {
-  onPermissionGranted: (stream: MediaStream) => void;
+  onPermissionGranted: (stream: MediaStream, sourceType: SourceType) => void;
 }
 
 export default function WebcamPermissionDialog({
@@ -31,6 +33,7 @@ export default function WebcamPermissionDialog({
 }: WebcamPermissionDialogProps) {
   const [isRequesting, setIsRequesting] = useState(false);
   const [error, setError] = useState<ErrorInfo | null>(null);
+  const [selectedSource, setSelectedSource] = useState<SourceType | null>(null);
 
   const [mounted, setMounted] = useState(false);
   useEffect(() => setMounted(true), []);
@@ -91,31 +94,40 @@ export default function WebcamPermissionDialog({
     };
   };
 
-  const requestWebcamAccess = useCallback(async () => {
+  const requestAccess = useCallback(async (sourceType: SourceType) => {
     setIsRequesting(true);
     setError(null);
+    setSelectedSource(sourceType);
 
     try {
-      if (!navigator.mediaDevices?.getUserMedia) {
-        throw new Error("NOT_SUPPORTED");
+      let stream: MediaStream;
+
+      if (sourceType === "camera") {
+        if (!navigator.mediaDevices?.getUserMedia) {
+          throw new Error("NOT_SUPPORTED");
+        }
+        stream = await navigator.mediaDevices.getUserMedia(VIDEO_CONSTRAINTS);
+      } else {
+        // Screen capture
+        if (!navigator.mediaDevices?.getDisplayMedia) {
+          throw new Error("NOT_SUPPORTED");
+        }
+        stream = await navigator.mediaDevices.getDisplayMedia({
+          video: true,
+          audio: false,
+        } as DisplayMediaStreamOptions);
       }
 
-      const stream =
-        await navigator.mediaDevices.getUserMedia(VIDEO_CONSTRAINTS);
-      onPermissionGranted(stream);
+      onPermissionGranted(stream, sourceType);
     } catch (err) {
       const errorInfo = getErrorInfo(err);
       setError(errorInfo);
-      console.error("Error accessing webcam:", err, errorInfo);
+      console.error(`Error accessing ${sourceType}:`, err, errorInfo);
     } finally {
       setIsRequesting(false);
     }
   }, [onPermissionGranted]);
 
-  useEffect(() => {
-    requestWebcamAccess();
-  }, [requestWebcamAccess]);
-
   const troubleshootingData = useMemo(
     () => ({
       [ERROR_TYPES.HTTPS]: {
@@ -209,15 +221,15 @@ export default function WebcamPermissionDialog({
   };
 
   const getTitle = () => {
-    if (isRequesting) return "Initialize Camera";
+    if (isRequesting) return selectedSource === "screen" ? "Initialize Screen Capture" : "Initialize Camera";
     if (error) return "Connection Failed";
-    return "Permission Required";
+    return "Select Video Source";
   };
 
   const getDescription = () => {
-    if (isRequesting) return "Requesting access to video input device...";
+    if (isRequesting) return "Requesting access to video source...";
     if (error) return error.message;
-    return "Ministral WebGPU requires local camera access for real-time inference.";
+    return "Choose your video source for real-time visual inference.";
   };
 
   return (
@@ -324,12 +336,57 @@ export default function WebcamPermissionDialog({
               </p>
             </div>
 
+            {/* Source Selection Buttons */}
+            {!isRequesting && !error && (
+              <div className="flex flex-col gap-3">
+                <Button
+                  onClick={() => requestAccess("camera")}
+                  className="w-full px-6 py-4 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-[var(--mistral-orange-dark)] flex items-center justify-center gap-3"
+                >
+                  <svg
+                    className="w-6 h-6"
+                    fill="none"
+                    viewBox="0 0 24 24"
+                    stroke="currentColor"
+                    strokeWidth={2}
+                  >
+                    <path
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                      d="M15 10l4.553-2.276A1 1 0 0121 8.618v6.764a1 1 0 01-1.447.894L15 14M5 18h8a2 2 0 002-2V8a2 2 0 00-2-2H5a2 2 0 00-2 2v8a2 2 0 002 2z"
+                    />
+                  </svg>
+                  Use Camera
+                </Button>
+
+                <Button
+                  onClick={() => requestAccess("screen")}
+                  className="w-full px-6 py-4 bg-gray-700 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-gray-800 flex items-center justify-center gap-3"
+                >
+                  <svg
+                    className="w-6 h-6"
+                    fill="none"
+                    viewBox="0 0 24 24"
+                    stroke="currentColor"
+                    strokeWidth={2}
+                  >
+                    <path
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                      d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"
+                    />
+                  </svg>
+                  Capture Screen
+                </Button>
+              </div>
+            )}
+
             {/* Error Actions */}
             {error && (
               <div className="animate-enter">
                 <div className="flex justify-center mb-6">
                   <Button
-                    onClick={requestWebcamAccess}
+                    onClick={() => requestAccess(selectedSource || "camera")}
                     disabled={isRequesting}
                     className="px-8 py-3 text-white shadow-lg hover:shadow-xl transition-all font-semibold tracking-wide hover:bg-[var(--mistral-orange-dark)]"
                   >
diff --git a/src/components/WelcomeScreen.tsx b/src/components/WelcomeScreen.tsx
index d69ec7e..4c58392 100644
--- a/src/components/WelcomeScreen.tsx
+++ b/src/components/WelcomeScreen.tsx
@@ -131,7 +131,7 @@ export default function WelcomeScreen({ onStart }: WelcomeScreenProps) {
                   Private & Local
                 </h4>
                 <p className="text-gray-600 leading-relaxed">
-                  Your video feed is processed locally and never sent to a
+                  Your video source (camera or screen) is processed locally and never sent to a
                   server, powered by
                   <a href="https://github.com/huggingface/transformers.js">
                     <span className="font-medium underline">
-- 
2.50.1 (Apple Git-155)

Mistral AI_ org

That's a great idea, and we will indeed try to make the demo evolve in that direction of supporting multiple sources.

I was rate-limited ... see https://huggingface.co/spaces/oliveregger/Ministral_3B_WebGPU where I applied the above patch. Great work on that example, thanks a lot!

Ready to merge
This branch is ready to get merged automatically.

Sign up or log in to comment