In the previous article, I discussed whether it is possible to use machine learning (in particular, face and mask detection) in the browser, approaches to detection, and optimization of all processes.
Today I want to give the technical details of the implementation.

In total, the following features are available inside a web worker:
- the navigator object
- the location object
- the XMLHttpRequest object
- setTimeout() / clearTimeout() and setInterval() / clearInterval()
Limitations:
- Communication between the main thread and a web worker goes through postMessage and the onmessage event handler (a minimal round-trip example is sketched after this list):
  worker.postMessage(message, [transfer]);
- The web worker does not have access to the DOM, so you cannot use canvas directly; OffscreenCanvas comes to the rescue.

Advantages:
- requestAnimationFrame allows you to receive images from the stream with maximum performance (60 FPS); in practice it is limited only by the camera, since not all cameras send video at that frame rate.
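As a quick illustration of the postMessage / onmessage communication and the transfer list, here is a minimal sketch (not code from the project; the file name and message fields are assumptions) showing how the main thread hands a pixel buffer to a worker and gets an answer back:

// main thread (main.js): send a frame buffer as a transferable object
const worker = new Worker('worker.js');
worker.onmessage = (event) => console.log('worker replied:', event.data);

const pixels = new Uint8ClampedArray(640 * 480 * 4);
worker.postMessage({ type: 'frame', buffer: pixels.buffer }, [pixels.buffer]); // transferred, not copied
console.log(pixels.buffer.byteLength); // 0 — the buffer is neutered on the sending side after the transfer

// worker.js: receive the message and answer back
self.onmessage = (event) => {
  if (event.data.type === 'frame') {
    self.postMessage({ type: 'frameReceived', bytes: event.data.buffer.byteLength });
  }
};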
In a web worker, it is impossible to know how much memory it actually consumes (performance.memory does not work in web workers). Because of this, we made it possible to run our application both through web workers and entirely in the main thread: by running all our detection models on the main thread, we can collect the memory-consumption metrics, see where a memory leak is, and fix it.

For working with web workers, a wrapper library was used initially. It is a very handy library that lets you work with a worker as a class object, without calling the onmessage and postMessage methods directly, and control the asynchronous code with async/await. All this was convenient until the application was launched on a tablet (Samsung Galaxy Tab S7) and suddenly crashed after 2 minutes. After analyzing all the code, no memory leaks were found except for the black box of this worker library: for some reason, the loaded TensorFlow.js models were not released and were kept somewhere inside it. It was decided to switch to a solution that works with web workers in pure JS without unnecessary layers, and this solved the problem; the application now runs for days without crashes.

this.faceDetectionWorker = workers.FaceRgbDetectionWorkerFactory.createWebWorker();
this.faceDetectionWorker.onmessage = async (event) => {
if (event.data.type === 'load') {
this.faceDetectionWorker.postMessage({
type: 'init',
backend,
streamSettings,
faceDetectionSettings,
imageRatio: this.imageRatio,
});
} else if (event.data.type === 'init') {
this.isFaceWorkerInit = event.data.status;
// Only when both workers are initialized do we start grabbing and processing frames
if (this.isFaceWorkerInit && this.isMaskWorkerInit) {
await this.grabFrame();
}
} else if (event.data.type === 'faceResults') {
this.onFaceDetected(event);
} else {
throw new Error(`Type=${event.data.type} is not supported by RgbVideo for FaceRgbDetectionWorker`);
}
};
this.faceDetectionWorker.postMessage(
{
type: 'detectFace',
originalImageToProcess: this.lastImage,
lastIndex: lastItem!.index,
},
[this.lastImage], // transferable object
);
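The grabFrame method itself is not shown in the article. Here is a minimal sketch of what such a loop could look like, assuming it is a method of the same class, that lastImage is an ImageBitmap, and that videoElement is the class's reference to the video tag (both field names are assumptions):

async grabFrame() {
  // createImageBitmap avoids touching the DOM canvas and produces a transferable bitmap
  this.lastImage = await createImageBitmap(this.videoElement);
  // ...here the frame is queued and posted to the workers, as in the snippet above...
  // schedule the next capture; the browser calls back at most once per display frame (~60 FPS)
  requestAnimationFrame(() => this.grabFrame());
}

On the worker side, the init function below prepares the backend, the BlazeFace model, and the OffscreenCanvas instances: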
export const init = async (data) => {
const { backend, streamSettings, faceDetectionSettings, imageRatio } = data;
flipHorizontal = streamSettings.flipHorizontal;
faceMinWidth = faceDetectionSettings.faceMinWidth;
faceMinWidthConversionFactor = faceDetectionSettings.faceMinWidthConversionFactor;
predictionIOU = faceDetectionSettings.predictionIOU;
recommendedLocation = faceDetectionSettings.useRecommendedLocation ? faceDetectionSettings.recommendedLocation : null;
detectedFaceThumbnailSize = faceDetectionSettings.detectedFaceThumbnailSize;
srcImageRatio = imageRatio;
await tfc.setBackend(backend);
await tfc.ready();
const [blazeModel] = await Promise.all([
blazeface.load({
// The maximum number of faces returned by the model
maxFaces: faceDetectionSettings.maxFaces,
// The width of the input image
inputWidth: faceDetectionSettings.faceDetectionImageMinWidth,
// The height of the input image
inputHeight: faceDetectionSettings.faceDetectionImageMinHeight,
// The threshold for deciding whether boxes overlap too much
iouThreshold: faceDetectionSettings.iouThreshold,
// The threshold for deciding when to remove boxes based on score
scoreThreshold: faceDetectionSettings.scoreThreshold,
}),
isOpenCvLoaded(),
]);
faceDetection = new FaceDetection();
originalImageToProcessCanvas = new OffscreenCanvas(srcImageRatio.videoWidth, srcImageRatio.videoHeight);
originalImageToProcessCanvasCtx = originalImageToProcessCanvas.getContext('2d');
resizedImageToProcessCanvas = new OffscreenCanvas(
srcImageRatio.faceDetectionImageWidth,
srcImageRatio.faceDetectionImageHeight,
);
resizedImageToProcessCanvasCtx = resizedImageToProcessCanvas.getContext('2d');
return blazeModel;
};
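The worker-side plumbing that receives these messages is not shown in the article. Below is a minimal sketch consistent with the message types handled on the main thread ('load', 'init', 'detectFace'/'faceResults'); the dispatcher itself and the exact shape of the result message are assumptions:

// Hypothetical message dispatcher inside the face detection worker
let faceModel = null;

self.onmessage = async (event) => {
  if (event.data.type === 'init') {
    faceModel = await init(event.data);
    self.postMessage({ type: 'init', status: true });
  } else if (event.data.type === 'detectFace') {
    const results = await detectFace(event.data, faceModel);
    self.postMessage({ type: 'faceResults', ...results });
  }
};

// Tell the main thread that the worker script has loaded and can be initialized
self.postMessage({ type: 'load' });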
The isOpenCvLoaded method waits for OpenCV to load:
export const isOpenCvLoaded = () => {
let timeoutId;
const resolveOpenCvPromise = (resolve) => {
if (timeoutId) {
clearTimeout(timeoutId);
}
try {
// eslint-disable-next-line no-undef
if (cv && cv.Mat) {
return resolve();
} else {
timeoutId = setTimeout(() => {
resolveOpenCvPromise(resolve);
}, OpenCvLoadedTimeoutInMs);
}
} catch {
timeoutId = setTimeout(() => {
resolveOpenCvPromise(resolve);
}, OpenCvLoadedTimeoutInMs);
}
};
return new Promise((resolve) => {
resolveOpenCvPromise(resolve);
});
};
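How OpenCV.js itself gets into the worker is not shown in the article. One common approach (an assumption here, not necessarily what this project does) is to pull it in with importScripts; the onRuntimeInitialized callback, or the polling above, then signals when the cv API is ready:

// Hypothetical loading of OpenCV.js inside the worker; the path is an example only
importScripts('/opencv/opencv.js');

// opencv.js invokes this once its WebAssembly runtime has finished initializing
cv.onRuntimeInitialized = () => {
  // from this point cv.Mat and the other APIs used in detectFace are available
};

Once OpenCV and the BlazeFace model are ready, detectFace does the per-frame work: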
export const detectFace = async (data, faceModel) => {
let { originalImageToProcess, lastIndex } = data;
const facesThumbnailsImageData = [];
// Resize original image to the recommended BlazeFace resolution
resizedImageToProcessCanvasCtx.drawImage(
originalImageToProcess,
0,
0,
srcImageRatio.faceDetectionImageWidth,
srcImageRatio.faceDetectionImageHeight,
);
// Getting resized image
let resizedImageDataToProcess = resizedImageToProcessCanvasCtx.getImageData(
0,
0,
srcImageRatio.faceDetectionImageWidth,
srcImageRatio.faceDetectionImageHeight,
);
// Detect faces by BlazeFace
let predictions = await faceModel.estimateFaces(
// The image to classify. Can be a tensor, DOM element image, video, or canvas
resizedImageDataToProcess,
// Whether to return tensors as opposed to values
returnTensors,
// Whether to flip/mirror the facial keypoints horizontally. Should be true for videos that are flipped by default (e.g. webcams)
flipHorizontal,
// Whether to annotate bounding boxes with additional properties such as landmarks and probability. Pass in `false` for faster inference if annotations are not needed
annotateBoxes,
);
// Normalize predictions
predictions = faceDetection.normalizePredictions(
predictions,
returnTensors,
annotateBoxes,
srcImageRatio.faceDetectionImageRatio,
);
// Filter the initial predictions by the criterion that all landmarks must be inside the area of interest
predictions = faceDetection.filterPredictionsByFullLandmarks(
predictions,
srcImageRatio.videoWidth,
srcImageRatio.videoHeight,
);
// Filters predictions by min face width
predictions = faceDetection.filterPredictionsByMinWidth(predictions, faceMinWidth, faceMinWidthConversionFactor);
// Filters predictions by recommended location
predictions = faceDetection.filterPredictionsByRecommendedLocation(predictions, predictionIOU, recommendedLocation);
// If there are any predictions, face thumbnail extraction is started according to the configured size
if (predictions && predictions.length > 0) {
// Draw initial original image
originalImageToProcessCanvasCtx.drawImage(originalImageToProcess, 0, 0);
const originalImageDataToProcess = originalImageToProcessCanvasCtx.getImageData(
0,
0,
originalImageToProcess.width,
originalImageToProcess.height,
);
// eslint-disable-next-line no-undef
let srcImageData = cv.matFromImageData(originalImageDataToProcess);
try {
for (let i = 0; i < predictions.length; i++) {
const prediction = predictions[i];
const facesOriginalLandmarks = JSON.parse(JSON.stringify(prediction.originalLandmarks));
if (flipHorizontal) {
for (let j = 0; j < facesOriginalLandmarks.length; j++) {
facesOriginalLandmarks[j][0] = srcImageRatio.videoWidth - facesOriginalLandmarks[j][0];
}
}
// eslint-disable-next-line no-undef
let dstImageData = new cv.Mat();
try {
// eslint-disable-next-line no-undef
let thumbnailSize = new cv.Size(detectedFaceThumbnailSize, detectedFaceThumbnailSize);
let transformation = getOneToOneFaceTransformationByTarget(detectedFaceThumbnailSize);
// eslint-disable-next-line no-undef
let similarityTransformation = getSimilarityTransformation(facesOriginalLandmarks, transformation);
// eslint-disable-next-line no-undef
let similarityTransformationMatrix = cv.matFromArray(3, 3, cv.CV_64F, similarityTransformation.data);
try {
// eslint-disable-next-line no-undef
cv.warpPerspective(
srcImageData,
dstImageData,
similarityTransformationMatrix,
thumbnailSize,
cv.INTER_LINEAR,
cv.BORDER_CONSTANT,
new cv.Scalar(127, 127, 127, 255),
);
facesThumbnailsImageData.push(
new ImageData(
new Uint8ClampedArray(dstImageData.data, dstImageData.cols, dstImageData.rows),
detectedFaceThumbnailSize,
detectedFaceThumbnailSize,
),
);
} finally {
similarityTransformationMatrix.delete();
similarityTransformationMatrix = null;
}
} finally {
dstImageData.delete();
dstImageData = null;
}
}
} finally {
srcImageData.delete();
srcImageData = null;
}
}
return { resizedImageDataToProcess, predictions, facesThumbnailsImageData, lastIndex };
};
Calling the faceModel.estimateFaces method starts the image analysis with BlazeFace, and the predictions (the face bounding box together with the coordinates of the nose, ears, eyes, and mouth area) are returned to the main thread. Before working with them, you need to restore the coordinates relative to the original image, because we downscaled it to 128 px. These data can then be used to decide whether the face is in the desired area and whether it has the minimum face width needed for subsequent identification; the restoration step is sketched below.
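A minimal sketch of such a restoration, assuming returnTensors is false (so the predictions are plain arrays) and a single uniform scale factor between the detection image and the original frame (the helper name is hypothetical):

// Scale a BlazeFace prediction made on the downscaled image back to the original resolution
const scalePredictionToOriginal = (prediction, ratio) => ({
  ...prediction,
  topLeft: [prediction.topLeft[0] * ratio, prediction.topLeft[1] * ratio],
  bottomRight: [prediction.bottomRight[0] * ratio, prediction.bottomRight[1] * ratio],
  landmarks: prediction.landmarks.map(([x, y]) => [x * ratio, y * ratio]),
});

// usage, e.g.: predictions.map((p) => scalePredictionToOriginal(p, srcImageRatio.faceDetectionImageRatio))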
Mask model initialization and the WebAssembly backend:
export const init = async (data) => {
const { backend, streamSettings, maskDetectionsSettings, imageRatio } = data;
flipHorizontal = streamSettings.flipHorizontal;
detectedMaskThumbnailSize = maskDetectionsSettings.detectedMaskThumbnailSize;
srcImageRatio = imageRatio;
await tfc.setBackend(backend);
await tfc.ready();
const [maskModel] = await Promise.all([
tfconv.loadGraphModel(
`/rgb_mask_classification_first/MobileNetV${maskDetectionsSettings.mobileNetVersion}_${maskDetectionsSettings.mobileNetWeight}/${maskDetectionsSettings.mobileNetType}/model.json`,
),
]);
detectedMaskThumbnailCanvas = new OffscreenCanvas(detectedMaskThumbnailSize, detectedMaskThumbnailSize);
detectedMaskThumbnailCanvasCtx = detectedMaskThumbnailCanvas.getContext('2d');
return maskModel;
};
this.maskDetectionWorker.postMessage({
type: 'detectMask',
prediction: lastItem!.data.predictions[0],
imageDataToProcess,
lastIndex: lastItem!.index,
});
export const detectMask = async (data, maskModel) => {
let { prediction, imageDataToProcess, lastIndex } = data;
const masksScores = [];
const maskLandmarks = JSON.parse(JSON.stringify(prediction.landmarks));
if (flipHorizontal) {
for (let j = 0; j < maskLandmarks.length; j++) {
maskLandmarks[j][0] = srcImageRatio.faceDetectionImageWidth - maskLandmarks[j][0];
}
}
// Draw thumbnail with mask
detectedMaskThumbnailCanvasCtx.putImageData(imageDataToProcess, 0, 0);
// Detect mask via NN
let predictionTensor = tfc.tidy(() => {
let maskDetectionSnapshotFromPixels = tfc.browser.fromPixels(detectedMaskThumbnailCanvas);
let maskDetectionSnapshotFromPixelsFloat32 = tfc.cast(maskDetectionSnapshotFromPixels, 'float32');
let expandedDims = maskDetectionSnapshotFromPixelsFloat32.expandDims(0);
return maskModel.predict(expandedDims);
});
// Put mask detection result into the returned array
try {
masksScores.push(predictionTensor.dataSync()[0].toFixed(4));
} finally {
predictionTensor.dispose();
predictionTensor = null;
}
return {
masksScores,
lastIndex,
};
};
The result of the neural network is the probability that a mask is present, which is returned from the worker. This makes it possible to raise or lower the mask-detection threshold. By lastIndex, we can match a face with the presence of a mask and display information about a specific person on the screen.
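To close the loop, here is a minimal sketch of how the main thread could interpret the returned score and match it with the face by lastIndex (the message type, threshold value, detectedFaces map, and renderOverlay method are assumptions, not code from the project):

// Main thread: hypothetical handler for the mask worker's results
this.maskDetectionWorker.onmessage = (event) => {
  if (event.data.type === 'maskResults') {
    const { masksScores, lastIndex } = event.data;
    const maskProbability = Number(masksScores[0]);
    const hasMask = maskProbability >= 0.5; // tunable threshold; 0.5 is just an example
    // Match the score with the face detected for the same frame index
    const face = this.detectedFaces.get(lastIndex);
    if (face) {
      this.renderOverlay(face, hasMask, maskProbability);
    }
  }
};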