import tempfile

import cv2
import numpy as np
import streamlit as st
from PIL import Image

from face_detection import FaceDetector
from mark_detection import MarkDetector
from pose_estimation import PoseEstimator
from utils import refine

st.title("Head Pose Estimation")
st.text(
    "Just a heads up (pun intended)... The code used for this space is largely "
    "borrowed from https://github.com/yinguobing/head-pose-estimation, slightly "
    "altered to fit image inputs and to make it work on Hugging Face."
)

# Choose between image or video file upload.
file_type = st.selectbox("Choose the type of file you want to upload", ("Image", "Video"))
uploaded_file = st.file_uploader(
    "Upload an image or video file of your face",
    type=["jpg", "jpeg", "png", "mp4", "mov", "avi", "mkv"],
)

# Placeholder that is updated in place with each processed video frame.
FRAME_WINDOW = st.image([])

if uploaded_file is not None:
    # Video processing.
    if file_type == "Video":
        # Write the upload to a temporary file so OpenCV can open it by path.
        tfile = tempfile.NamedTemporaryFile(delete=False)
        tfile.write(uploaded_file.read())
        cap = cv2.VideoCapture(tfile.name)
        st.write(f"Video source: {tfile.name}")

        # Get the frame size; the pose estimator needs it for the camera matrix.
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Initialize the face detection, landmark detection, and pose estimation models.
        face_detector = FaceDetector("assets/face_detector.onnx")
        mark_detector = MarkDetector("assets/face_landmarks.onnx")
        pose_estimator = PoseEstimator(frame_width, frame_height)

        # Process the video frame by frame.
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Step 1: detect faces in the frame.
            faces, _ = face_detector.detect(frame, 0.7)

            # If a face is detected, proceed with pose estimation.
            if len(faces) > 0:
                # Step 2: refine the box to a square and crop the first face.
                face = refine(faces, frame_width, frame_height, 0.15)[0]
                x1, y1, x2, y2 = face[:4].astype(int)
                patch = frame[y1:y2, x1:x2]

                # Step 3: run landmark detection, then map the 68 marks from
                # the local face patch back to global image coordinates.
                marks = mark_detector.detect([patch])[0].reshape([68, 2])
                marks *= (x2 - x1)
                marks[:, 0] += x1
                marks[:, 1] += y1

                # Step 4: estimate the head pose from the detected landmarks.
                pose = pose_estimator.solve(marks)

                # Step 5: draw the pose on the frame.
                pose_estimator.visualize(frame, pose, color=(0, 255, 0))

            # OpenCV frames are BGR; convert to RGB for Streamlit display.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            FRAME_WINDOW.image(frame_rgb)

        cap.release()

    # Image processing.
    elif file_type == "Image":
        # PIL loads images as RGB; convert to BGR, which is what the
        # OpenCV-based detection pipeline expects.
        image = cv2.cvtColor(np.array(Image.open(uploaded_file).convert("RGB")), cv2.COLOR_RGB2BGR)
        frame_height, frame_width, _ = image.shape

        # Initialize models for detection and pose estimation.
        face_detector = FaceDetector("assets/face_detector.onnx")
        mark_detector = MarkDetector("assets/face_landmarks.onnx")
        pose_estimator = PoseEstimator(frame_width, frame_height)

        # Detect faces in the image.
        faces, _ = face_detector.detect(image, 0.7)

        if len(faces) > 0:
            face = refine(faces, frame_width, frame_height, 0.15)[0]
            x1, y1, x2, y2 = face[:4].astype(int)
            patch = image[y1:y2, x1:x2]

            # Detect landmarks and map them to global image coordinates.
            marks = mark_detector.detect([patch])[0].reshape([68, 2])
            marks *= (x2 - x1)
            marks[:, 0] += x1
            marks[:, 1] += y1

            # Estimate the pose and draw it on the image.
            pose = pose_estimator.solve(marks)
            pose_estimator.visualize(image, pose, color=(0, 255, 0))

        # Convert back to RGB and display in Streamlit.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        st.image(image_rgb, caption="Pose Estimated Image", use_column_width=True)
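
# ---------------------------------------------------------------------------
# Usage note: assuming this file is saved as app.py with the two ONNX models
# present under assets/, the app can be run locally with:
#
#     streamlit run app.py
#
# The video and image branches above run the same detect -> refine ->
# landmarks -> solve pipeline. A minimal sketch of how that shared logic could
# be factored into one helper (the name `estimate_pose_inplace` is
# hypothetical, not part of the upstream repo; it assumes the same
# FaceDetector / MarkDetector / PoseEstimator interfaces used above):
#
# def estimate_pose_inplace(frame, face_detector, mark_detector, pose_estimator):
#     """Detect a face in a BGR frame and draw its pose in place, if found."""
#     h, w = frame.shape[:2]
#     faces, _ = face_detector.detect(frame, 0.7)
#     if len(faces) == 0:
#         return frame
#     face = refine(faces, w, h, 0.15)[0]
#     x1, y1, x2, y2 = face[:4].astype(int)
#     marks = mark_detector.detect([frame[y1:y2, x1:x2]])[0].reshape([68, 2])
#     marks *= (x2 - x1)
#     marks[:, 0] += x1
#     marks[:, 1] += y1
#     pose = pose_estimator.solve(marks)
#     pose_estimator.visualize(frame, pose, color=(0, 255, 0))
#     return frame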