""" gaze_pinch_v0.py — Vision-Pro-style webcam controller, v0 (raw feel test)

What it does:

  • Tracks your iris (gaze) from the built-in webcam -> moves the macOS cursor.
  • Detects a thumb+index PINCH -> left click.
  • 9-point calibration maps your raw gaze ratios to screen coordinates.

This is a DELIBERATELY RAW prototype. The gaze cursor WILL jitter — that is the point. Feeling that jitter is what motivates the v1 "snap-to-UI-element" design.

Requirements (run once): pip install mediapipe opencv-python pyautogui numpy

macOS permissions (System Settings > Privacy & Security):

  • Camera: allow Terminal (or whatever runs python)
  • Accessibility: allow Terminal <-- REQUIRED or the cursor won't move

Run: python gaze_pinch_v0.py

Keys:
  During calibration: look at the dot, press SPACE to capture (9 times)
  During run: 'q' or ESC to quit, 'c' to recalibrate
"""

import math
import time

import cv2
import mediapipe as mp
import numpy as np
import pyautogui

# ----------------------------- tuning knobs --------------------------------

# Each knob lives on its own line so that its trailing comment cannot swallow
# the assignments that follow it.
PINCH_ON = 0.45          # pinch detected when (thumb-index dist / hand size) < this
PINCH_OFF = 0.60         # release threshold (hysteresis to avoid flicker)
CLICK_COOLDOWN = 0.40    # seconds between clicks
CALIB_SAMPLES = 12       # gaze samples averaged per calibration point
EURO_MIN_CUTOFF = 0.8    # lower = smoother but laggier
EURO_BETA = 0.012        # higher = more responsive to fast movement
CAM_INDEX = 0            # index passed to cv2.VideoCapture

# ---------------------------------------------------------------------------

# PyAutoGUI's fail-safe is switched off here; presumably because the gaze
# cursor is clamped to the screen edges and would otherwise trip it — confirm
# against the PyAutoGUI fail-safe documentation before shipping.
pyautogui.FAILSAFE = False
# No artificial pause after each PyAutoGUI call; the cursor is moved every frame.
pyautogui.PAUSE = 0.0
# Screen size as reported by PyAutoGUI; used for calibration targets and clamping.
SCREEN_W, SCREEN_H = pyautogui.size()

# MediaPipe FaceMesh iris landmark indices (refine_landmarks=True)

# One landmark group per line. Each iris index is only ever used together with
# the corner and lid indices carrying the same L_/R_ prefix (see gaze_ratio).
L_IRIS, R_IRIS = 468, 473
L_EYE_OUT, L_EYE_IN = 33, 133
R_EYE_IN, R_EYE_OUT = 362, 263
L_LID_TOP, L_LID_BOT = 159, 145
R_LID_TOP, R_LID_BOT = 386, 374

# Hand landmark indices

# Indices into the hand landmark list, one name per line.
WRIST = 0
THUMB_TIP = 4
INDEX_MCP = 5
INDEX_TIP = 8

# --------------------------- One Euro filter -------------------------------

class OneEuro:
    """Adaptive low-pass ("One Euro") filter for a single scalar signal.

    The smoothing cutoff grows with the estimated speed of the signal:
    ``cutoff = min_cutoff + beta * |dx_hat|``, so slow movement is smoothed
    heavily while fast movement passes through with less lag.

    Args:
        min_cutoff: Cutoff used when the signal is not moving.
        beta: How strongly the cutoff increases with the filtered speed.
        d_cutoff: Cutoff used to smooth the speed estimate itself.

    Call the instance as ``f(x, t)`` with a sample and its timestamp in
    seconds; it returns the filtered value.
    """

    # The constructor must be spelled ``__init__``; a plain ``init`` is never
    # invoked by ``OneEuro(...)`` and leaves the instance without any state.
    def __init__(self, min_cutoff=1.0, beta=0.0, d_cutoff=1.0):
        self.min_cutoff, self.beta, self.d_cutoff = min_cutoff, beta, d_cutoff
        self.x_prev = None    # last filtered value (None until the first sample)
        self.dx_prev = 0.0    # last filtered derivative
        self.t_prev = None    # timestamp of the last sample

    @staticmethod
    def _alpha(cutoff, dt):
        """Return the exponential-smoothing factor for ``cutoff`` and step ``dt``."""
        tau = 1.0 / (2 * math.pi * cutoff)
        return 1.0 / (1.0 + tau / dt)

    def __call__(self, x, t):
        """Filter sample ``x`` taken at time ``t`` and return the smoothed value."""
        if self.x_prev is None:
            # First sample: nothing to smooth against yet, pass it through.
            self.x_prev, self.t_prev = x, t
            return x

        # Clamp the step so repeated or out-of-order timestamps cannot produce
        # a zero or negative dt.
        dt = max(1e-3, t - self.t_prev)

        # Smooth the raw derivative first ...
        dx = (x - self.x_prev) / dt
        a_d = self._alpha(self.d_cutoff, dt)
        dx_hat = a_d * dx + (1 - a_d) * self.dx_prev

        # ... then let the speed decide how aggressively the value is smoothed.
        cutoff = self.min_cutoff + self.beta * abs(dx_hat)
        a = self._alpha(cutoff, dt)
        x_hat = a * x + (1 - a) * self.x_prev

        self.x_prev, self.dx_prev, self.t_prev = x_hat, dx_hat, t
        return x_hat

def _ratio(val, lo, hi): if abs(hi - lo) < 1e-6: return 0.5 return float(np.clip((val - lo) / (hi - lo), 0.0, 1.0))

def gaze_ratio(lm):
    """Return (h, v) in roughly 0..1 from iris position within the eyes.

    Calibration absorbs the exact semantics; we just need stable features.

    Args:
        lm: Indexable sequence of face landmarks; each entry used here must
            expose ``.x`` and ``.y``.

    Returns:
        ``(h, v)``: the iris position between the two eye corners (h) and
        between the upper and lower lid (v), each averaged over both eyes.
    """

    def eye_h(iris, outer, inner):
        # Sort the corners so the ratio does not depend on which is leftmost.
        lo, hi = sorted([lm[outer].x, lm[inner].x])
        return _ratio(lm[iris].x, lo, hi)

    def eye_v(iris, top, bot):
        lo, hi = sorted([lm[top].y, lm[bot].y])
        return _ratio(lm[iris].y, lo, hi)

    h = (eye_h(L_IRIS, L_EYE_OUT, L_EYE_IN)
         + eye_h(R_IRIS, R_EYE_OUT, R_EYE_IN)) / 2
    v = (eye_v(L_IRIS, L_LID_TOP, L_LID_BOT)
         + eye_v(R_IRIS, R_LID_TOP, R_LID_BOT)) / 2
    return h, v

def quad_features(h, v):
    """Return the quadratic feature vector ``[1, h, v, h*v, h^2, v^2]`` as float64."""
    return np.array([1.0, h, v, h * v, h * h, v * v], dtype=np.float64)

def fit_map(samples, targets):
    """Fit the least-squares quadratic map from gaze ratios to screen coordinates.

    Args:
        samples: list of ``(h, v)`` gaze ratios.
        targets: list of ``(sx, sy)`` screen positions, one per sample.

    Returns:
        ``(cx, cy)``: two coefficient vectors, each matching the layout of
        ``quad_features``, for the screen x and y coordinate respectively.

    Raises:
        ValueError: if ``samples`` is empty or its length differs from
            ``targets``.
    """
    if not samples or len(samples) != len(targets):
        raise ValueError(
            "fit_map needs one target per sample and at least one sample "
            f"(got {len(samples)} samples, {len(targets)} targets)"
        )
    A = np.array([quad_features(h, v) for (h, v) in samples])
    # Both screen axes share the same design matrix, so solve them together
    # as a two-column right-hand side instead of running lstsq twice.
    B = np.array([(t[0], t[1]) for t in targets], dtype=np.float64)
    coeffs, *_ = np.linalg.lstsq(A, B, rcond=None)
    return coeffs[:, 0], coeffs[:, 1]

def apply_map(cx, cy, h, v):
    """Evaluate the calibration map at gaze ratios ``(h, v)``.

    Args:
        cx, cy: Coefficient vectors as returned by ``fit_map``.
        h, v: Gaze ratios as returned by ``gaze_ratio``.

    Returns:
        ``(sx, sy)`` as plain floats; the values are not clamped to the screen.
    """
    f = quad_features(h, v)
    return float(f @ cx), float(f @ cy)

def pinch_strength(hand_lm):
    """Normalized thumb-index distance (smaller = pinching).

    Args:
        hand_lm: Indexable sequence of hand landmarks; each entry used here
            must expose ``.x`` and ``.y``.

    Returns:
        The thumb-tip to index-tip distance divided by the wrist to
        index-MCP distance, so the value does not scale with how large the
        hand appears in the frame.
    """

    def d(a, b):
        return math.hypot(hand_lm[a].x - hand_lm[b].x,
                          hand_lm[a].y - hand_lm[b].y)

    # The epsilon keeps the division finite if both reference landmarks coincide.
    hand_size = d(WRIST, INDEX_MCP) + 1e-6
    return d(THUMB_TIP, INDEX_TIP) / hand_size

# ------------------------------- main --------------------------------------

def _grab_rgb(cap):
    """Read one webcam frame; return ``(mirrored_bgr, rgb)`` or ``None`` if the read failed."""
    ok, frame = cap.read()
    if not ok:
        return None
    frame = cv2.flip(frame, 1)
    return frame, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


def _average_gaze(cap, face):
    """Average ``gaze_ratio`` over up to CALIB_SAMPLES frames.

    Only frames that were read successfully and contain a face contribute,
    and the mean is taken over those frames alone, so dropped frames cannot
    drag the sample toward zero. Returns ``(h, v)`` or ``None`` when no frame
    produced a face.
    """
    readings = []
    for _ in range(CALIB_SAMPLES):
        grabbed = _grab_rgb(cap)
        if grabbed is None:
            continue
        res = face.process(grabbed[1])
        if res.multi_face_landmarks:
            readings.append(gaze_ratio(res.multi_face_landmarks[0].landmark))
    if not readings:
        return None
    h, v = np.mean(readings, axis=0)
    return float(h), float(v)


def _calibrate(cap, face):
    """Run the 9-point calibration; return ``(cx, cy)`` or ``None`` if ESC was pressed."""
    pts = [(0.1, 0.1), (0.5, 0.1), (0.9, 0.1),
           (0.1, 0.5), (0.5, 0.5), (0.9, 0.5),
           (0.1, 0.9), (0.5, 0.9), (0.9, 0.9)]
    win = "calibration"
    # Create the window with a window flag, then switch it to full screen
    # through the dedicated window property.
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    cv2.setWindowProperty(win, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    samples, targets = [], []
    for (px, py) in pts:
        tx, ty = int(px * SCREEN_W), int(py * SCREEN_H)

        # The target image is identical for every frame of this point, so it
        # is drawn once instead of once per frame.
        canvas = np.zeros((SCREEN_H, SCREEN_W, 3), dtype=np.uint8)
        cv2.circle(canvas, (tx, ty), 18, (0, 0, 255), -1)
        cv2.circle(canvas, (tx, ty), 6, (255, 255, 255), -1)
        cv2.putText(canvas, "Look at the dot, press SPACE", (60, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (200, 200, 200), 2)

        captured = False
        while not captured:
            grabbed = _grab_rgb(cap)
            if grabbed is None:
                continue
            res = face.process(grabbed[1])
            cv2.imshow(win, canvas)
            key = cv2.waitKey(1) & 0xFF
            if key == 27:  # ESC aborts; the caller releases camera and windows
                return None
            # SPACE only counts while a face is visible.
            if key == 32 and res.multi_face_landmarks:
                sample = _average_gaze(cap, face)
                if sample is not None:  # otherwise stay on this point and retry
                    samples.append(sample)
                    targets.append((tx, ty))
                    captured = True

    cv2.destroyWindow(win)
    return fit_map(samples, targets)


def _track(cap, face, hands, cx, cy):
    """Drive the cursor from gaze and click on pinch.

    Returns ``True`` when the user pressed 'c' (recalibrate) and ``False``
    when the user pressed 'q' or ESC (quit).
    """
    fx = OneEuro(EURO_MIN_CUTOFF, EURO_BETA)
    fy = OneEuro(EURO_MIN_CUTOFF, EURO_BETA)
    pinching = False
    last_click = float("-inf")  # the very first pinch is never blocked by the cooldown
    print("Running. 'q'/ESC to quit, 'c' to recalibrate.")

    while True:
        grabbed = _grab_rgb(cap)
        if grabbed is None:
            continue
        frame, rgb = grabbed
        fres = face.process(rgb)
        hres = hands.process(rgb)
        # Monotonic clock: filter steps and the click cooldown are intervals
        # and must not be affected by wall-clock adjustments.
        t = time.monotonic()

        # gaze -> cursor
        if fres.multi_face_landmarks:
            h, v = gaze_ratio(fres.multi_face_landmarks[0].landmark)
            sx, sy = apply_map(cx, cy, h, v)
            sx = fx(sx, t)
            sy = fy(sy, t)
            sx = int(np.clip(sx, 0, SCREEN_W - 1))
            sy = int(np.clip(sy, 0, SCREEN_H - 1))
            pyautogui.moveTo(sx, sy)

        # pinch -> click (edge-triggered with hysteresis + cooldown)
        if hres.multi_hand_landmarks:
            s = pinch_strength(hres.multi_hand_landmarks[0].landmark)
            if not pinching and s < PINCH_ON:
                pinching = True
                if t - last_click > CLICK_COOLDOWN:
                    pyautogui.click()
                    last_click = t
            elif pinching and s > PINCH_OFF:
                pinching = False
            # The overlay is drawn only while a hand is visible; ``s`` does
            # not exist otherwise.
            cv2.putText(frame, f"pinch {s:.2f}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8,
                        (0, 255, 0) if pinching else (200, 200, 200), 2)

        cv2.imshow("preview (q to quit)", frame)
        key = cv2.waitKey(1) & 0xFF
        if key in (ord('q'), 27):
            return False
        if key == ord('c'):
            cv2.destroyAllWindows()
            return True


def main():
    """Calibrate, then run the gaze/pinch loop until the user quits.

    Opens the webcam and the MediaPipe face and hand trackers, runs the
    calibration, then moves the cursor and clicks until 'q' or ESC is
    pressed. Pressing 'c' during the run repeats the calibration with the
    same camera and trackers rather than re-entering ``main`` recursively.

    Raises:
        SystemExit: if the webcam cannot be opened.
    """
    mp_face = mp.solutions.face_mesh
    mp_hands = mp.solutions.hands

    cap = cv2.VideoCapture(CAM_INDEX)
    if not cap.isOpened():
        raise SystemExit("Cannot open webcam. Check camera permission / CAM_INDEX.")

    try:
        # The context managers close both trackers on every exit path.
        with mp_face.FaceMesh(refine_landmarks=True, max_num_faces=1,
                              min_detection_confidence=0.6,
                              min_tracking_confidence=0.6) as face, \
             mp_hands.Hands(max_num_hands=1,
                            min_detection_confidence=0.6,
                            min_tracking_confidence=0.6) as hands:
            while True:
                mapping = _calibrate(cap, face)
                if mapping is None:      # ESC during calibration
                    break
                cx, cy = mapping
                if not _track(cap, face, hands, cx, cy):
                    break                # 'q' / ESC during the run
                # otherwise 'c' was pressed: loop around and recalibrate
    finally:
        # Always hand the camera back and close windows, even on an exception.
        cap.release()
        cv2.destroyAllWindows()

# Script entry point: the dunder names are required; a bare ``name`` is an
# undefined variable and "main" never equals the module's run-as-script name.
if __name__ == "__main__":
    main()