""" gaze_pinch_v0.py — Vision-Pro-style webcam controller, v0 (raw feel test)
What it does:
- Tracks your iris (gaze) from the built-in webcam -> moves the macOS cursor.
- Detects a thumb+index PINCH -> left click.
- 9-point calibration maps your raw gaze ratios to screen coordinates.
This is a DELIBERATELY RAW prototype. The gaze cursor WILL jitter — that is the point. Feeling that jitter is what motivates the v1 "snap-to-UI-element" design.
Requirements (run once): pip install mediapipe opencv-python pyautogui numpy
macOS permissions (System Settings > Privacy & Security):
- Camera: allow Terminal (or whatever runs python)
- Accessibility: allow Terminal <-- REQUIRED or the cursor won't move
Run: python gaze_pinch_v0.py
Keys:
  During calibration: look at the dot, press SPACE to capture (9 times); ESC aborts.
  During run: 'q' or ESC to quit, 'c' to recalibrate.
"""
import math
import time

import cv2
import mediapipe as mp
import numpy as np
import pyautogui
# ----------------------------- tuning knobs --------------------------------
# One constant per line: when these share a line, everything after the first
# '#' is parsed as a comment and the remaining names are never defined.
PINCH_ON = 0.45  # pinch detected when (thumb-index dist / hand size) < this
PINCH_OFF = 0.60  # release threshold (hysteresis to avoid flicker)
CLICK_COOLDOWN = 0.40  # seconds between clicks
CALIB_SAMPLES = 12  # gaze samples averaged per calibration point
EURO_MIN_CUTOFF = 0.8  # lower = smoother but laggier
EURO_BETA = 0.012  # higher = more responsive to fast movement
CAM_INDEX = 0  # device index handed to cv2.VideoCapture
# ---------------------------------------------------------------------------
# Turn off pyautogui's fail-safe and its per-call pause, then cache the
# screen size that gaze coordinates are mapped onto.
pyautogui.FAILSAFE = False
pyautogui.PAUSE = 0.0
SCREEN_W, SCREEN_H = pyautogui.size()
# MediaPipe FaceMesh landmark indices: iris, eye corners, eyelids
# (iris landmarks are only produced with refine_landmarks=True)
# Indices into the face landmark list, one feature pair per line.
L_IRIS, R_IRIS = 468, 473  # iris landmarks
L_EYE_OUT, L_EYE_IN = 33, 133  # left-eye corners
R_EYE_IN, R_EYE_OUT = 362, 263  # right-eye corners
L_LID_TOP, L_LID_BOT = 159, 145  # left-eye lids
R_LID_TOP, R_LID_BOT = 386, 374  # right-eye lids
# Hand landmark indices
# Indices into the hand landmark list used for pinch detection.
WRIST = 0
THUMB_TIP = 4
INDEX_TIP = 8
INDEX_MCP = 5
# --------------------------- One Euro filter -------------------------------
class OneEuro:
    """Adaptive low-pass filter for a scalar signal sampled at irregular times.

    The smoothing cutoff grows with the (filtered) speed of the signal:
    ``cutoff = min_cutoff + beta * |dx_hat|``. Slow motion is smoothed heavily,
    fast motion is followed with less lag.

    Args:
        min_cutoff: Cutoff used when the signal is not moving.
        beta: How strongly the cutoff increases with speed.
        d_cutoff: Cutoff of the low-pass applied to the derivative estimate.
    """

    def __init__(self, min_cutoff=1.0, beta=0.0, d_cutoff=1.0):
        # Must be the real constructor name; a plain ``init`` is never called
        # by ``OneEuro(...)`` and leaves the instance without any state.
        self.min_cutoff = min_cutoff
        self.beta = beta
        self.d_cutoff = d_cutoff
        self.x_prev = None  # last filtered value (None until the first sample)
        self.dx_prev = 0.0  # last filtered derivative
        self.t_prev = None  # timestamp of the last sample

    @staticmethod
    def _alpha(cutoff, dt):
        """Return the exponential-smoothing factor for ``cutoff`` over step ``dt``."""
        tau = 1.0 / (2 * math.pi * cutoff)
        return 1.0 / (1.0 + tau / dt)

    def __call__(self, x, t):
        """Filter sample ``x`` taken at time ``t`` and return the smoothed value.

        The first sample is returned unchanged and only seeds the state.
        """
        if self.x_prev is None:
            self.x_prev, self.t_prev = x, t
            return x

        # Floor dt so repeated or out-of-order timestamps cannot divide by zero.
        dt = max(1e-3, t - self.t_prev)

        # Smoothed derivative of the signal.
        dx = (x - self.x_prev) / dt
        a_d = self._alpha(self.d_cutoff, dt)
        dx_hat = a_d * dx + (1 - a_d) * self.dx_prev

        # Speed-dependent cutoff, then the actual low-pass step.
        cutoff = self.min_cutoff + self.beta * abs(dx_hat)
        a = self._alpha(cutoff, dt)
        x_hat = a * x + (1 - a) * self.x_prev

        self.x_prev, self.dx_prev, self.t_prev = x_hat, dx_hat, t
        return x_hat
def _ratio(val, lo, hi):
    """Return where ``val`` sits in ``[lo, hi]`` as a float clamped to 0..1.

    A (near-)zero-width interval yields 0.5 instead of dividing by zero.
    """
    # The guard has to be its own statement: a compound ``if`` cannot follow
    # ``def ...:`` on the same line.
    if abs(hi - lo) < 1e-6:
        return 0.5
    return float(np.clip((val - lo) / (hi - lo), 0.0, 1.0))
def gaze_ratio(lm):
    """Return (h, v) in roughly 0..1 from iris position within the eyes.

    ``lm`` is an indexable sequence of landmarks exposing ``.x`` and ``.y``.
    ``h`` is the iris position between the two eye corners and ``v`` the iris
    position between the upper and lower lid, each averaged over both eyes.
    Calibration absorbs the exact semantics; we just need stable features.
    """

    def eye_h(iris, outer, inner):
        # Sort the corners so the ratio always runs along increasing x,
        # whichever of the two corners has the smaller coordinate.
        lo, hi = sorted([lm[outer].x, lm[inner].x])
        return _ratio(lm[iris].x, lo, hi)

    def eye_v(iris, top, bot):
        lo, hi = sorted([lm[top].y, lm[bot].y])
        return _ratio(lm[iris].y, lo, hi)

    h = (eye_h(L_IRIS, L_EYE_OUT, L_EYE_IN)
         + eye_h(R_IRIS, R_EYE_OUT, R_EYE_IN)) / 2
    v = (eye_v(L_IRIS, L_LID_TOP, L_LID_BOT)
         + eye_v(R_IRIS, R_LID_TOP, R_LID_BOT)) / 2
    return h, v
def quad_features(h, v):
    """Build the quadratic feature vector [1, h, v, h*v, h^2, v^2] as float64."""
    terms = (1.0, h, v, h * v, h * h, v * v)
    return np.array(terms, dtype=np.float64)
def fit_map(samples, targets):
    """Fit a least-squares quadratic map from gaze ratios to screen coordinates.

    Args:
        samples: list of (h, v) gaze ratios, one per calibration point.
        targets: list of (sx, sy) screen positions, in the same order.

    Returns:
        (cx, cy): coefficient vectors over ``quad_features`` for the screen
        x and y coordinate respectively.
    """
    # One row of quadratic features per calibration sample.
    A = np.array([quad_features(h, v) for (h, v) in samples])
    X = np.array([t[0] for t in targets])
    Y = np.array([t[1] for t in targets])
    # Each screen axis is fitted independently against the same design matrix.
    cx, *_ = np.linalg.lstsq(A, X, rcond=None)
    cy, *_ = np.linalg.lstsq(A, Y, rcond=None)
    return cx, cy
def apply_map(cx, cy, h, v):
    """Evaluate the fitted map at gaze ratios (h, v).

    ``cx`` and ``cy`` are the coefficient vectors returned by ``fit_map``.
    Returns the predicted ``(screen_x, screen_y)`` as plain floats; the result
    is not clamped to the screen here.
    """
    f = quad_features(h, v)
    return float(f @ cx), float(f @ cy)
def pinch_strength(hand_lm):
    """Normalized thumb-index distance (smaller = pinching).

    ``hand_lm`` is an indexable sequence of landmarks exposing ``.x`` and
    ``.y``. The thumb-tip to index-tip distance is divided by the wrist to
    index-MCP distance, so the value is relative to the hand's apparent size
    in the image.
    """

    def d(a, b):
        # 2-D distance between landmarks a and b.
        return math.hypot(hand_lm[a].x - hand_lm[b].x,
                          hand_lm[a].y - hand_lm[b].y)

    # Epsilon keeps the division defined if both landmarks coincide.
    hand_size = d(WRIST, INDEX_MCP) + 1e-6
    return d(THUMB_TIP, INDEX_TIP) / hand_size
# ------------------------------- main --------------------------------------
def _read_rgb(cap):
    """Read one mirrored frame; return (bgr, rgb), or None if the read failed."""
    ok, frame = cap.read()
    if not ok:
        return None
    frame = cv2.flip(frame, 1)
    return frame, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


def _average_gaze(cap, face):
    """Average gaze_ratio over up to CALIB_SAMPLES frames; None if no face was seen."""
    acc = np.zeros(2)
    used = 0
    for _ in range(CALIB_SAMPLES):
        grabbed = _read_rgb(cap)
        if grabbed is None:
            continue
        res = face.process(grabbed[1])
        if res.multi_face_landmarks:
            acc += np.array(gaze_ratio(res.multi_face_landmarks[0].landmark))
            used += 1
    if used == 0:
        return None
    # Divide by the frames that actually contributed; dividing by
    # CALIB_SAMPLES would pull the sample toward 0 whenever a frame is dropped.
    return tuple((acc / used).tolist())


def _calibrate(cap, face):
    """Run the 9-point calibration; return (cx, cy), or None if ESC was pressed."""
    pts = [(0.1, 0.1), (0.5, 0.1), (0.9, 0.1),
           (0.1, 0.5), (0.5, 0.5), (0.9, 0.5),
           (0.1, 0.9), (0.5, 0.9), (0.9, 0.9)]
    win = "calibration"
    # namedWindow takes a window flag; the fullscreen property is set separately.
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    cv2.setWindowProperty(win, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    samples, targets = [], []
    for px, py in pts:
        tx, ty = int(px * SCREEN_W), int(py * SCREEN_H)
        # The target image only depends on the point, so draw it once.
        canvas = np.zeros((SCREEN_H, SCREEN_W, 3), dtype=np.uint8)
        cv2.circle(canvas, (tx, ty), 18, (0, 0, 255), -1)
        cv2.circle(canvas, (tx, ty), 6, (255, 255, 255), -1)
        cv2.putText(canvas, "Look at the dot, press SPACE", (60, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (200, 200, 200), 2)

        captured = None
        while captured is None:
            grabbed = _read_rgb(cap)
            face_visible = False
            if grabbed is not None:
                res = face.process(grabbed[1])
                face_visible = bool(res.multi_face_landmarks)
            # Keep drawing and polling keys even when a read fails, so ESC
            # still works if the camera stalls.
            cv2.imshow(win, canvas)
            key = cv2.waitKey(1) & 0xFF
            if key == 27:
                cv2.destroyWindow(win)
                return None
            if key == 32 and face_visible:
                # Stays None (and the point is retried) if the burst saw no face.
                captured = _average_gaze(cap, face)
        samples.append(captured)
        targets.append((tx, ty))

    cv2.destroyWindow(win)
    return fit_map(samples, targets)


def _run(cap, face, hands, cx, cy):
    """Drive cursor and clicks until a key ends the loop.

    Returns True if 'c' (recalibrate) was pressed, False on 'q' or ESC.
    """
    fx = OneEuro(EURO_MIN_CUTOFF, EURO_BETA)
    fy = OneEuro(EURO_MIN_CUTOFF, EURO_BETA)
    pinching = False
    last_click = float("-inf")  # the first pinch is never blocked by the cooldown
    print("Running. 'q'/ESC to quit, 'c' to recalibrate.")
    while True:
        grabbed = _read_rgb(cap)
        if grabbed is not None:
            frame, rgb = grabbed
            fres = face.process(rgb)
            hres = hands.process(rgb)
            # Monotonic clock: filter dt and cooldown are unaffected by
            # wall-clock adjustments.
            t = time.monotonic()

            # gaze -> cursor
            if fres.multi_face_landmarks:
                h, v = gaze_ratio(fres.multi_face_landmarks[0].landmark)
                sx, sy = apply_map(cx, cy, h, v)
                sx = fx(sx, t)
                sy = fy(sy, t)
                sx = int(np.clip(sx, 0, SCREEN_W - 1))
                sy = int(np.clip(sy, 0, SCREEN_H - 1))
                pyautogui.moveTo(sx, sy)

            # pinch -> click (edge-triggered with hysteresis + cooldown)
            if hres.multi_hand_landmarks:
                s = pinch_strength(hres.multi_hand_landmarks[0].landmark)
                if not pinching and s < PINCH_ON:
                    pinching = True
                    if t - last_click > CLICK_COOLDOWN:
                        pyautogui.click()
                        last_click = t
                elif pinching and s > PINCH_OFF:
                    pinching = False
                cv2.putText(frame, f"pinch {s:.2f}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8,
                            (0, 255, 0) if pinching else (200, 200, 200), 2)

            cv2.imshow("preview (q to quit)", frame)

        key = cv2.waitKey(1) & 0xFF
        if key in (ord('q'), 27):
            return False
        if key == ord('c'):
            return True


def main():
    """Open the webcam, calibrate, then run the gaze/pinch loop.

    Pressing 'c' during the run loop starts a fresh calibration; 'q' or ESC
    (or ESC during calibration) exits. The camera, the MediaPipe graphs and
    all OpenCV windows are released on every exit path.

    Raises:
        SystemExit: if the webcam cannot be opened.
    """
    mp_face = mp.solutions.face_mesh
    mp_hands = mp.solutions.hands

    cap = cv2.VideoCapture(CAM_INDEX)
    if not cap.isOpened():
        raise SystemExit("Cannot open webcam. Check camera permission / CAM_INDEX.")

    try:
        with mp_face.FaceMesh(refine_landmarks=True, max_num_faces=1,
                              min_detection_confidence=0.6,
                              min_tracking_confidence=0.6) as face:
            with mp_hands.Hands(max_num_hands=1,
                                min_detection_confidence=0.6,
                                min_tracking_confidence=0.6) as hands:
                # Loop instead of recursing into main() for recalibration.
                while True:
                    coeffs = _calibrate(cap, face)
                    if coeffs is None:
                        return
                    cx, cy = coeffs
                    if not _run(cap, face, hands, cx, cy):
                        return
                    cv2.destroyAllWindows()
    finally:
        cap.release()
        cv2.destroyAllWindows()
# Run only when executed as a script. The dunder names are required:
# a bare ``name`` is undefined and raises NameError at import time.
if __name__ == "__main__":
    main()