Add depth estimation model (#366)
* add model file

* add model

* blur focus

* blur effect

* lint

* readme

* fix ini

* focal blur

* conversion

* lint

* fix shader
royshil authored Jul 4, 2023
1 parent 1d521f9 commit 04ba710
Showing 9 changed files with 124 additions and 41 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -25,3 +25,5 @@

# Exclude CMake build number cache
/cmake/.CMakeBuildNumber

src/*.generated.*
5 changes: 3 additions & 2 deletions README.md
@@ -16,7 +16,7 @@ A plugin for [OBS Studio](https://obsproject.com/) that allows you to replace th
- [MacOSX](#mac-osx)
- [Linux (Ubuntu, Arch, openSUSE)](#linux)
- [Windows](#windows)

🚧 Check out our experimental [CleanStream](https://github.com/royshil/obs-cleanstream) OBS plugin for real-time filler-word (uh, um) and profanity removal from a live audio stream 🚧

## Download
@@ -78,14 +78,15 @@ The pretrained model weights used for portrait foreground segmentation are taken
- https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.7/contrib/PP-HumanSeg
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/082_MediaPipe_Meet_Segmentation
- https://github.com/PeterL1n/RobustVideoMatting
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/384_TCMonoDepth and https://github.com/yu-li/TCMonoDepth

Image enhancement (low light) models are taken from:
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/213_TBEFN
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/372_URetinex-Net
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/370_Semantic-Guided-Low-Light-Image-Enhancement
- https://github.com/PINTO0309/PINTO_model_zoo/tree/main/243_Zero-DCE-improved

Some more information about how I built it: https://www.morethantechnical.com/2021/04/15/obs-plugin-for-portrait-background-removal-with-onnx-sinet-model/
Some more information about how I built it: https://www.morethantechnical.com/2021/04/15/obs-plugin-for-portrait-background-removal-with-onnx-sinet-model/ and https://www.morethantechnical.com/2023/05/20/building-an-obs-background-removal-plugin-a-walkthrough/

## Building

34 changes: 32 additions & 2 deletions data/effects/kawase_blur.effect
@@ -1,9 +1,14 @@
uniform float4x4 ViewProj;
uniform texture2d image;
uniform texture2d focalmask; // focal (depth) mask

uniform float xOffset;
uniform float yOffset;

uniform int blurIter; // Current blur iteration
uniform int blurTotal; // Total number of blur iterations
uniform float blurFocusPoint; // Focus point for the blur. 0 = back, 1 = front

sampler_state textureSampler {
Filter = Linear;
AddressU = Clamp;
@@ -28,8 +33,33 @@ VertDataOut VSDefault(VertDataOut v_in)
return vert_out;
}

float4 PSKawaseBlur(VertDataOut v_in) : TARGET
/**
* Kawase focal blur
* The blur amount for each pixel is the difference between its estimated
* depth and the focus point. The focus point is a value between 0 and 1,
* where 0 is the back of the image and 1 is the front.
*/
float4 PSKawaseFocalBlur(VertDataOut v_in) : TARGET
{
float blurIterF = float(blurIter) / float(blurTotal);

// Blur the focal mask to get a smoother value; otherwise aliasing occurs
float blurValue = focalmask.Sample(textureSampler, v_in.uv).r;
blurValue += focalmask.Sample(textureSampler, v_in.uv + float2( 0.01, 0.01)).r;
blurValue += focalmask.Sample(textureSampler, v_in.uv + float2(-0.01, 0.01)).r;
blurValue += focalmask.Sample(textureSampler, v_in.uv + float2( 0.01, -0.01)).r;
blurValue += focalmask.Sample(textureSampler, v_in.uv + float2(-0.01, -0.01)).r;
blurValue *= 0.25;

// Calculate the distance from the focus point for this pixel
float blurFocusDistance = clamp(abs(blurValue - blurFocusPoint), 0.0, 1.0);

if (blurIterF > blurFocusDistance) {
// Past this pixel's blur distance: return the pixel as-is, don't blur further
return image.Sample(textureSampler, v_in.uv);
}

// Calculate the blur value from neighboring pixels
float4 sum = float4(0.0, 0.0, 0.0, 0.0);
sum += image.Sample(textureSampler, v_in.uv + float2( xOffset, yOffset));
sum += image.Sample(textureSampler, v_in.uv + float2(-xOffset, yOffset));
@@ -44,6 +74,6 @@ technique Draw
pass
{
vertex_shader = VSDefault(v_in);
pixel_shader = PSKawaseBlur(v_in);
pixel_shader = PSKawaseFocalBlur(v_in);
}
}
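To make the focal-blur logic above concrete, here is a minimal, standalone C++ sketch (not part of the commit; `iterationsApplied` is a hypothetical name) of the per-pixel decision `PSKawaseFocalBlur` makes: a pixel keeps receiving blur passes only while the normalized iteration index stays below its distance from the focus plane.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// depth: smoothed focal-mask sample in [0, 1] (0 = back, 1 = front).
// focusPoint: mirrors the blurFocusPoint uniform.
// blurTotal: mirrors the blurTotal uniform (total number of passes).
int iterationsApplied(float depth, float focusPoint, int blurTotal)
{
	// Same clamp(abs(...)) as the shader's blurFocusDistance
	float focusDistance =
		std::clamp(std::fabs(depth - focusPoint), 0.0f, 1.0f);
	int applied = 0;
	for (int i = 0; i < blurTotal; i++) {
		if ((float)i / (float)blurTotal > focusDistance)
			break; // mirrors the shader's early return
		applied++;
	}
	return applied;
}

int main()
{
	// Focus on the front (1.0) with 10 passes: a far pixel
	// (depth 0.1) receives all 10, a near pixel (depth 0.9) only 2.
	std::printf("far:  %d\n", iterationsApplied(0.1f, 1.0f, 10));
	std::printf("near: %d\n", iterationsApplied(0.9f, 1.0f, 10));
}
```

Pixels near the focus plane drop out of the blur loop early while distant pixels keep accumulating passes, which is what produces the depth-of-field falloff.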
13 changes: 2 additions & 11 deletions data/effects/mask_alpha_filter.effect
@@ -2,9 +2,6 @@ uniform float4x4 ViewProj;

uniform texture2d image; // input RGBA
uniform texture2d alphamask; // alpha mask
uniform int blurSize; // Size of the image blur kernel. 0 = no blur
uniform float xTexelSize; // Size of texel in X coord
uniform float yTexelSize; // Size of texel in Y coord
uniform texture2d blurredBackground; // input RGBA

sampler_state textureSampler {
@@ -33,14 +30,8 @@ VertDataOut VSDefault(VertDataIn v_in)

float4 PSAlphaMaskRGBAWithBlur(VertDataOut v_in) : TARGET
{
float4 inputRGBA = image.Sample(textureSampler, v_in.uv);
inputRGBA.rgb = max(float3(0.0, 0.0, 0.0), inputRGBA.rgb / inputRGBA.a);

float4 outputRGBA;
float a = (1.0 - alphamask.Sample(textureSampler, v_in.uv).r) * inputRGBA.a;
outputRGBA.rgb = inputRGBA.rgb * a + blurredBackground.Sample(textureSampler, v_in.uv).rgb * (1.0 - a);
outputRGBA.a = 1;
return outputRGBA;
// Return the blurred image; the focal mask was already applied during blurring
return float4(blurredBackground.Sample(textureSampler, v_in.uv).rgb, 1.0);
}

float4 PSAlphaMaskRGBAWithoutBlur(VertDataOut v_in) : TARGET
2 changes: 2 additions & 0 deletions data/locale/en-US.ini
@@ -28,3 +28,5 @@ URETINEX="URetinex-Net"
SGLLIE="Semantic Guided Enhancement"
ZERODCE="Zero-DCE"
EnableThreshold="Enable threshold"
BlurFocusPoint="Blur focus point"
TCMonoDepth="TCMonoDepth (Depth)"
Binary file added data/models/tcmonodepth_tcsmallnet_192x320.onnx
79 changes: 53 additions & 26 deletions src/background-filter.cpp
@@ -21,6 +21,7 @@
#include "models/ModelSelfie.h"
#include "models/ModelRVM.h"
#include "models/ModelPPHumanSeg.h"
#include "models/ModelTCMonoDepth.h"
#include "FilterData.h"
#include "ort-utils/ort-session-utils.h"
#include "obs-utils/obs-utils.h"
@@ -38,6 +39,7 @@ struct background_removal_filter : public filter_data {
int maskEveryXFrames = 1;
int maskEveryXFramesCount = 0;
int64_t blurBackground = 0;
float blurFocusPoint = 0.1f;

gs_effect_t *effect;
gs_effect_t *kawaseBlurEffect;
@@ -71,6 +73,7 @@ obs_properties_t *background_filter_properties(void *data)
{
obs_properties_t *props = obs_properties_create();

/* Threshold props */
obs_property_t *p = obs_properties_add_bool(
props, "enable_threshold", obs_module_text("EnableThreshold"));
obs_property_set_modified_callback(p, enable_threshold_modified);
@@ -92,6 +95,7 @@
props, "feather", obs_module_text("FeatherBlendSilhouette"),
0.0, 1.0, 0.05);

/* GPU, CPU, and performance props */
obs_property_t *p_use_gpu = obs_properties_add_list(
props, "useGPU", obs_module_text("InferenceDevice"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -111,6 +115,13 @@
USEGPU_COREML);
#endif

obs_properties_add_int(props, "mask_every_x_frames",
obs_module_text("CalculateMaskEveryXFrame"), 1,
300, 1);
obs_properties_add_int_slider(props, "numThreads",
obs_module_text("NumThreads"), 0, 8, 1);

/* Model selection props */
obs_property_t *p_model_select = obs_properties_add_list(
props, "model_select", obs_module_text("SegmentationModel"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -128,17 +139,19 @@
obs_property_list_add_string(p_model_select,
obs_module_text("Robust Video Matting"),
MODEL_RVM);
obs_property_list_add_string(p_model_select,
obs_module_text("TCMonoDepth"),
MODEL_DEPTH_TCMONODEPTH);

obs_properties_add_int(props, "mask_every_x_frames",
obs_module_text("CalculateMaskEveryXFrame"), 1,
300, 1);

/* Background blur props */
obs_properties_add_int_slider(
props, "blur_background",
obs_module_text("BlurBackgroundFactor0NoBlurUseColor"), 0, 20,
1);
obs_properties_add_int_slider(props, "numThreads",
obs_module_text("NumThreads"), 0, 8, 1);

obs_properties_add_float_slider(props, "blur_focus_point",
obs_module_text("BlurFocusPoint"), 0.0,
1.0, 0.05);

UNUSED_PARAMETER(data);
return props;
@@ -163,6 +176,7 @@
obs_data_set_default_int(settings, "mask_every_x_frames", 1);
obs_data_set_default_int(settings, "blur_background", 0);
obs_data_set_default_int(settings, "numThreads", 1);
obs_data_set_default_double(settings, "blur_focus_point", 0.1);
}

void background_filter_update(void *data, obs_data_t *settings)
@@ -182,6 +196,8 @@
(int)obs_data_get_int(settings, "mask_every_x_frames");
tf->maskEveryXFramesCount = (int)(0);
tf->blurBackground = obs_data_get_int(settings, "blur_background");
tf->blurFocusPoint =
(float)obs_data_get_double(settings, "blur_focus_point");

const std::string newUseGpu = obs_data_get_string(settings, "useGPU");
const std::string newModel =
@@ -211,6 +227,9 @@
if (tf->modelSelection == MODEL_PPHUMANSEG) {
tf->model.reset(new ModelPPHumanSeg);
}
if (tf->modelSelection == MODEL_DEPTH_TCMONODEPTH) {
tf->model.reset(new ModelTCMonoDepth);
}

createOrtSession(tf);
}
@@ -433,7 +452,8 @@
}

static gs_texture_t *blur_background(struct background_removal_filter *tf,
uint32_t width, uint32_t height)
uint32_t width, uint32_t height,
gs_texture_t *alphaTexture)
{
if (tf->blurBackground == 0 || !tf->kawaseBlurEffect) {
return nullptr;
@@ -444,10 +464,18 @@
gs_texrender_get_texture(tf->texrender));
gs_eparam_t *image =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "image");
gs_eparam_t *focalmask =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "focalmask");
gs_eparam_t *xOffset =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "xOffset");
gs_eparam_t *yOffset =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "yOffset");
gs_eparam_t *blurIter =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "blurIter");
gs_eparam_t *blurTotal =
gs_effect_get_param_by_name(tf->kawaseBlurEffect, "blurTotal");
gs_eparam_t *blurFocusPointParam = gs_effect_get_param_by_name(
tf->kawaseBlurEffect, "blurFocusPoint");

for (int i = 0; i < (int)tf->blurBackground; i++) {
gs_texrender_reset(tf->texrender);
@@ -458,8 +486,12 @@
}

gs_effect_set_texture(image, blurredTexture);
gs_effect_set_texture(focalmask, alphaTexture);
gs_effect_set_float(xOffset, ((float)i + 0.5f) / (float)width);
gs_effect_set_float(yOffset, ((float)i + 0.5f) / (float)height);
gs_effect_set_int(blurIter, i);
gs_effect_set_int(blurTotal, (int)tf->blurBackground);
gs_effect_set_float(blurFocusPointParam, tf->blurFocusPoint);

struct vec4 background;
vec4_zero(&background);
@@ -492,22 +524,12 @@
return;
}

// Output the masked image

gs_texture_t *blurredTexture = blur_background(tf, width, height);

if (!tf->effect) {
// Effect failed to load, skip rendering
obs_source_skip_video_filter(tf->source);
return;
}

if (!obs_source_process_filter_begin(tf->source, GS_RGBA,
OBS_ALLOW_DIRECT_RENDERING)) {
obs_source_skip_video_filter(tf->source);
return;
}

gs_texture_t *alphaTexture = nullptr;
{
std::lock_guard<std::mutex> lock(tf->outputLock);
@@ -520,21 +542,26 @@
return;
}
}

// Output the masked image
gs_texture_t *blurredTexture =
blur_background(tf, width, height, alphaTexture);

if (!obs_source_process_filter_begin(tf->source, GS_RGBA,
OBS_ALLOW_DIRECT_RENDERING)) {
obs_source_skip_video_filter(tf->source);
gs_texture_destroy(alphaTexture);
gs_texture_destroy(blurredTexture);
return;
}

gs_eparam_t *alphamask =
gs_effect_get_param_by_name(tf->effect, "alphamask");
gs_eparam_t *blurSize =
gs_effect_get_param_by_name(tf->effect, "blurSize");
gs_eparam_t *xTexelSize =
gs_effect_get_param_by_name(tf->effect, "xTexelSize");
gs_eparam_t *yTexelSize =
gs_effect_get_param_by_name(tf->effect, "yTexelSize");
gs_eparam_t *blurredBackground =
gs_effect_get_param_by_name(tf->effect, "blurredBackground");

gs_effect_set_texture(alphamask, alphaTexture);
gs_effect_set_int(blurSize, (int)tf->blurBackground);
gs_effect_set_float(xTexelSize, 1.0f / (float)width);
gs_effect_set_float(yTexelSize, 1.0f / (float)height);

if (tf->blurBackground > 0) {
gs_effect_set_texture(blurredBackground, blurredTexture);
}
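For context on the loop above: each Kawase pass reads the previous pass's output and samples (i + 0.5) texels away, so a handful of cheap passes compounds into a much larger effective kernel. Here is a standalone sketch of the offset schedule `blur_background()` feeds the effect (the frame size is an assumption for illustration):

```cpp
#include <cstdio>

int main()
{
	const float width = 1280.0f, height = 720.0f; // assumed frame size
	const int blurTotal = 5; // stands in for tf->blurBackground

	// Same offsets blur_background() sets on each iteration
	for (int i = 0; i < blurTotal; i++) {
		float xOffset = ((float)i + 0.5f) / width;
		float yOffset = ((float)i + 0.5f) / height;
		std::printf("pass %d: xOffset=%.6f yOffset=%.6f\n", i,
			    xOffset, yOffset);
	}
}
```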
2 changes: 2 additions & 0 deletions src/consts.h
@@ -11,6 +11,8 @@ const char *const MODEL_ENHANCE_URETINEX = "models/uretinex_net_180x320.onnx";
const char *const MODEL_ENHANCE_SGLLIE =
"models/semantic_guided_llie_180x324.onnx";
const char *const MODEL_ENHANCE_ZERODCE = "models/zero_dce_180x320.onnx";
const char *const MODEL_DEPTH_TCMONODEPTH =
"models/tcmonodepth_tcsmallnet_192x320.onnx";

const char *const USEGPU_CPU = "cpu";
const char *const USEGPU_DML = "dml";
28 changes: 28 additions & 0 deletions src/models/ModelTCMonoDepth.h
@@ -0,0 +1,28 @@
#ifndef MODELTCMONODEPTH_H
#define MODELTCMONODEPTH_H

#include "Model.h"

class ModelTCMonoDepth : public ModelBCHW {
private:
/* data */
public:
ModelTCMonoDepth(/* args */) {}
~ModelTCMonoDepth() {}

virtual void prepareInputToNetwork(cv::Mat &resizedImage,
cv::Mat &preprocessedImage)
{
// Do not normalize from [0, 255] to [0, 1].

hwc_to_chw(resizedImage, preprocessedImage);
}

virtual void postprocessOutput(cv::Mat &outputImage)
{
cv::normalize(outputImage, outputImage, 1.0, 0.0,
cv::NORM_MINMAX);
}
};

#endif // MODELTCMONODEPTH_H
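A standalone sketch of what `postprocessOutput` does to the raw network output: `cv::NORM_MINMAX` with bounds 1.0/0.0 rescales an arbitrary-range relative-depth map into [0, 1], the range the focal mask expects (the 2x2 values below are made up for illustration):

```cpp
#include <cstdio>
#include <opencv2/core.hpp>

int main()
{
	// Fabricated raw depth values with an arbitrary range
	cv::Mat depth = (cv::Mat_<float>(2, 2) << 3.2f, 7.9f, 5.0f, 12.4f);

	// Same call as ModelTCMonoDepth::postprocessOutput():
	// the minimum maps to 0 and the maximum to 1
	cv::normalize(depth, depth, 1.0, 0.0, cv::NORM_MINMAX);

	for (int r = 0; r < depth.rows; r++)
		for (int c = 0; c < depth.cols; c++)
			std::printf("%.3f ", depth.at<float>(r, c));
	std::printf("\n"); // prints: 0.000 0.511 0.196 1.000
}
```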
