In this article, we will look at how to integrate the YOLO v2 model into an iOS application to detect objects in a live video stream. We start with integrating the model, then walk through the required code changes, and finish with a working example.
First, copy the yolov2-pipeline.mlmodel file saved earlier into the Models folder of the ObjectDetectionDemo iOS application and add it to the project. Once added, Xcode generates a Swift wrapper class for the model, yolov2_pipeline, which we instantiate below.
To use the model, we need to make a few changes to the code from the previous articles. The startCapture method of the VideoCapture class must accept and store a Vision framework request parameter of type VNRequest:
public func startCapture(_ visionRequest: VNRequest?) {
    if let visionRequest = visionRequest {
        self.visionRequests = [visionRequest]
    } else {
        self.visionRequests = []
    }

    if !captureSession.isRunning {
        captureSession.startRunning()
    }
}
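For reference, this assumes VideoCapture keeps the requests in a stored visionRequests array and owns the capture session; a minimal sketch of the assumed declarations (the rest of the class, which configures the camera input and output, stays as before):

// Assumed stored properties inside VideoCapture (sketch only)
private let captureSession = AVCaptureSession()
private var visionRequests = [VNRequest]()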
Next, add the ObjectDetection class and implement its createObjectDetectionVisionRequest method:
public func createObjectDetectionVisionRequest() -> VNRequest? {
    do {
        let model = yolov2_pipeline().model
        let visionModel = try VNCoreMLModel(for: model)
        let objectRecognition = VNCoreMLRequest(model: visionModel, completionHandler: { (request, error) in
            DispatchQueue.main.async(execute: {
                if let results = request.results {
                    self.processVisionRequestResults(results)
                }
            })
        })

        objectRecognition.imageCropAndScaleOption = .scaleFill
        return objectRecognition
    } catch let error as NSError {
        print("Model loading error: \(error)")
        return nil
    }
}
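A side note: on recent Xcode versions, the parameterless generated initializer used above is deprecated in favor of the throwing init(configuration:). If you see such a warning, the model can be loaded inside the same do/catch block with something along these lines:

// Alternative model loading for newer SDKs (sketch only)
let model = try yolov2_pipeline(configuration: MLModelConfiguration()).model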
Note that we use the .scaleFill value for imageCropAndScaleOption. This introduces a slight distortion when the captured 480 x 640 frames are scaled to the 416 x 416 size the model expects. It has no significant impact on the results, but it keeps the subsequent scaling math simple: because no letterboxing is applied, the normalized bounding boxes returned by Vision map directly onto the full frame. With these pieces in place, everything is wired together in the view controller that owns cameraView:
self.videoCapture = VideoCapture(self.cameraView.layer)
self.objectDetection = ObjectDetection(self.cameraView.layer, videoFrameSize: self.videoCapture.getCaptureFrameSize())
let visionRequest = self.objectDetection.createObjectDetectionVisionRequest()
self.videoCapture.startCapture(visionRequest)
With this framework in place, the logic defined in visionRequest runs on every captured video frame:
public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
        return
    }

    let frameOrientation: CGImagePropertyOrientation = .up
    let imageRequestHandler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: frameOrientation, options: [:])
    do {
        try imageRequestHandler.perform(self.visionRequests)
    } catch {
        print(error)
    }
}
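This callback is only invoked if VideoCapture registers itself as the sample buffer delegate of the session's video data output. That setup is part of the existing VideoCapture class; a minimal sketch of the assumed wiring (the queue label is illustrative):

// Inside VideoCapture's session setup (sketch only)
let videoOutput = AVCaptureVideoDataOutput()
videoOutput.alwaysDiscardsLateVideoFrames = true
// Frames are delivered to captureOutput(_:didOutput:from:) on this queue
videoOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "VideoDataOutputQueue"))
if captureSession.canAddOutput(videoOutput) {
    captureSession.addOutput(videoOutput)
}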
With the above changes, the yolov2_pipeline model runs on every captured frame, and the detection results are passed to the ObjectDetection.processVisionRequestResults method. Because the pipeline model already contains the model_decoder and model_nms components, no decoding logic is needed on the iOS side. We simply read the most likely label of each observation (objectObservation) and draw the corresponding box over the captured frame (the createBoundingBoxLayer and addSublayer calls):
private func processVisionRequestResults(_ results: [Any]) {
    CATransaction.begin()
    CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)

    // Remove the boxes drawn for the previous frame
    self.objectDetectionLayer.sublayers = nil

    for observation in results where observation is VNRecognizedObjectObservation {
        guard let objectObservation = observation as? VNRecognizedObjectObservation else {
            continue
        }

        // Labels are sorted by confidence, so the first one is the most likely class
        let topLabelObservation = objectObservation.labels[0]
        let objectBounds = VNImageRectForNormalizedRect(
            objectObservation.boundingBox,
            Int(self.objectDetectionLayer.bounds.width), Int(self.objectDetectionLayer.bounds.height))

        let bbLayer = self.createBoundingBoxLayer(objectBounds, identifier: topLabelObservation.identifier, confidence: topLabelObservation.confidence)
        self.objectDetectionLayer.addSublayer(bbLayer)
    }
    CATransaction.commit()
}
Drawing the boxes is relatively simple and not specific to the machine learning part of the application. The main difficulty is getting the scale and the coordinate systems right: "0, 0" means the top-left corner to the model, but the bottom-left corner to iOS and the Vision framework.
Two methods of the ObjectDetection class take care of this: setupObjectDetectionLayer and createBoundingBoxLayer. The former prepares the layer the boxes are drawn on:
private func setupObjectDetectionLayer(_ viewLayer: CALayer, _ videoFrameSize: CGSize) {
    self.objectDetectionLayer = CALayer()
    self.objectDetectionLayer.name = "ObjectDetectionLayer"
    self.objectDetectionLayer.bounds = CGRect(x: 0.0, y: 0.0, width: videoFrameSize.width, height: videoFrameSize.height)
    self.objectDetectionLayer.position = CGPoint(x: viewLayer.bounds.midX, y: viewLayer.bounds.midY)

    viewLayer.addSublayer(self.objectDetectionLayer)

    let bounds = viewLayer.bounds
    let scale = fmax(bounds.size.width / videoFrameSize.width, bounds.size.height / videoFrameSize.height)

    CATransaction.begin()
    CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)

    // The negative Y scale flips the layer so Vision's bottom-left origin maps to the screen's top-left origin
    self.objectDetectionLayer.setAffineTransform(CGAffineTransform(scaleX: scale, y: -scale))
    self.objectDetectionLayer.position = CGPoint(x: bounds.midX, y: bounds.midY)
    CATransaction.commit()
}
The createBoundingBoxLayer method creates the shapes to be drawn:
private func createBoundingBoxLayer(_ bounds: CGRect, identifier: String, confidence: VNConfidence) -> CALayer {
    let path = UIBezierPath(rect: bounds)

    let boxLayer = CAShapeLayer()
    boxLayer.path = path.cgPath
    boxLayer.strokeColor = UIColor.red.cgColor
    boxLayer.lineWidth = 2
    boxLayer.fillColor = CGColor(colorSpace: CGColorSpaceCreateDeviceRGB(), components: [0.0, 0.0, 0.0, 0.0])

    boxLayer.bounds = bounds
    boxLayer.position = CGPoint(x: bounds.midX, y: bounds.midY)
    boxLayer.name = "Detected Object Box"
    boxLayer.backgroundColor = CGColor(colorSpace: CGColorSpaceCreateDeviceRGB(), components: [0.5, 0.5, 0.2, 0.3])
    boxLayer.cornerRadius = 6

    let textLayer = CATextLayer()
    textLayer.name = "Detected Object Label"
    textLayer.string = String(format: "%@\n(%.2f)", identifier, confidence)
    textLayer.fontSize = CGFloat(16.0)
    textLayer.bounds = CGRect(x: 0, y: 0, width: bounds.size.width - 10, height: bounds.size.height - 10)
    textLayer.position = CGPoint(x: bounds.midX, y: bounds.midY)
    textLayer.alignmentMode = .center
    textLayer.foregroundColor = UIColor.red.cgColor
    textLayer.contentsScale = 2.0

    // Mirror the text layer back so the label reads correctly inside the Y-flipped objectDetectionLayer
    textLayer.setAffineTransform(CGAffineTransform(scaleX: 1.0, y: -1.0))

    boxLayer.addSublayer(textLayer)
    return boxLayer
}
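One small, optional refinement that is not part of the original code: deriving contentsScale from the screen instead of hard-coding 2.0 keeps the label text crisp on 3x devices as well:

textLayer.contentsScale = UIScreen.main.scale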
Congratulations: you now have a working object detection application, which you can test on real-life footage or, as in our case, on free clips from the Pexels portal.
Note that the YOLO v2 model is quite sensitive to image orientation, at least for some object classes (the person class, for example). Detection results degrade if the frames are rotated before processing.
This can be illustrated with any sample image from the Open Images dataset: exactly the same image, two very different results. Keep this in mind and make sure the images fed to the model are oriented correctly.
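If the application needs to support more than one device orientation, one option (not part of the original sample) is to derive the value passed to VNImageRequestHandler from the current device orientation instead of hard-coding .up. The exact mapping depends on how the capture connection is configured; the sketch below assumes the back camera delivering buffers in its native landscape-right orientation:

// Sketch: map the device orientation to the CGImagePropertyOrientation Vision expects
private func currentFrameOrientation() -> CGImagePropertyOrientation {
    switch UIDevice.current.orientation {
    case .portrait:           return .right
    case .portraitUpsideDown: return .left
    case .landscapeLeft:      return .up
    case .landscapeRight:     return .down
    default:                  return .up
    }
}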
That's it! It has been a long journey, but we have finally reached the end: a working iOS application that detects objects in a live video stream.