计算机视觉是人工智能领域的一个重要分支,它涉及到图像和视频数据的存储、处理和检索技术。在自动驾驶汽车、目标检测、机器人技术、目标跟踪等领域,计算机视觉技术扮演着至关重要的角色。
OpenCV是一个开源的计算机视觉和机器学习库,主要用于图像处理和实时数据的处理。通过OpenCV,可以对图像和视频进行处理,使得算法能够识别出汽车、交通信号、车牌、人脸甚至人类的手写体。结合其他数据分析库,OpenCV能够根据用户的需求对图像和视频进行处理。
Mediapipe是一个用于构建多模态音频、视频或时间序列数据的框架。借助Mediapipe,可以构建出令人印象深刻的机器学习管道,例如TensorFlow、TFLite等推理模型,以及媒体处理功能。
姿态估计,即从视频或实时流中估计人体姿态,在全身手势控制、量化体育锻炼和手语识别等多个领域中扮演着关键角色。例如,它可以作为健身、瑜伽和舞蹈应用的基础模型。在增强现实领域,姿态估计也占有重要地位。
Media Pipe Pose是一个用于高精度人体姿态跟踪的框架,它从RGB视频帧中获取输入,并推断出整个人身上的33个3D标志点。当前最先进的方法主要依赖于功能强大的桌面环境进行推理,而这种方法在实时性上超越了其他方法,并且取得了非常好的结果。
首先,安装所有必要的库。使用以下命令安装OpenCV-python和mediapipe:
pip install OpenCV-python
pip install mediapipe
import cv2
import mediapipe as mp
import time
mpPose = mp.solutions.pose
pose = mpPose.Pose()
mpDraw = mp.solutions.drawing_utils
cap = cv2.VideoCapture('a.mp4')
pTime = 0
while True:
success, img = cap.read()
imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
results = pose.process(imgRGB)
if results.pose_landmarks:
mpDraw.draw_landmarks(img, results.pose_landmarks, mpPose.POSE_CONNECTIONS)
for id, lm in enumerate(results.pose_landmarks.landmark):
h, w, c = img.shape
cx, cy = int(lm.x*w), int(lm.y*h)
cv2.circle(img, (cx, cy), 5, (255,0,0), cv2.FILLED)
cTime = time.time()
fps = 1/(cTime-pTime)
pTime = cTime
cv2.putText(img, str(int(fps)), (50,50), cv2.FONT_HERSHEY_SIMPLEX,1,(255,0,0), 3)
cv2.imshow("Image", img)
cv2.waitKey(1)
import cv2
import mediapipe as mp
import time
class PoseDetector:
def __init__(self, mode = False, upBody = False, smooth=True, detectionCon = 0.5, trackCon = 0.5):
self.mode = mode
self.upBody = upBody
self.smooth = smooth
self.detectionCon = detectionCon
self.trackCon = trackCon
self.mpDraw = mp.solutions.drawing_utils
self.mpPose = mp.solutions.pose
self.pose = self.mpPose.Pose(self.mode, self.upBody, self.smooth, self.detectionCon, self.trackCon)
def findPose(self, img, draw=True):
imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
self.results = self.pose.process(imgRGB)
if self.results.pose_landmarks:
if draw:
self.mpDraw.draw_landmarks(img, self.results.pose_landmarks, self.mpPose.POSE_CONNECTIONS)
return img
def getPosition(self, img, draw=True):
lmList= []
if self.results.pose_landmarks:
for id, lm in enumerate(self.results.pose_landmarks.landmark):
h, w, c = img.shape
cx, cy = int(lm.x * w), int(lm.y * h)
lmList.append([id, cx, cy])
if draw:
cv2.circle(img, (cx, cy), 5, (255, 0, 0), cv2.FILLED)
return lmList
def main():
cap = cv2.VideoCapture('videos/a.mp4') #make VideoCapture(0) for webcam
pTime = 0
detector = PoseDetector()
while True:
success, img = cap.read()
img = detector.findPose(img)
lmList = detector.getPosition(img)
print(lmList)
cTime = time.time()
fps = 1 / (cTime - pTime)
pTime = cTime
cv2.putText(img, str(int(fps)), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 3)
cv2.imshow("Image", img)
cv2.waitKey(1)
if __name__ == "__main__":
main()