【DFL学习日记#3】S3FDExtractor.py 检测人脸边界框

LeoSasion · 2025-10-3 11:49:54

【学习日记】Extract 切脸 - Deep 换脸 - BBS_Monster
Extractor.py 包含了 S3FD（检测人脸边界框）和 FANExtractor（检测人脸关键点）两部分，
本文只讲 S3FD。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
S3FD人脸检测器实现

该模块实现了Single Shot Scale-invariant Face Detector (S3FD)人脸检测算法，
用于检测图像中的人脸并返回人脸边界框坐标。
"""

import operator
from pathlib import Path

import cv2
import numpy as np

from core.leras import nn

class S3FDExtractor(object):
    """
    S3FD人脸检测器提取器类

    该类封装了S3FD人脸检测模型，提供了加载模型和从图像中提取人脸边界框的功能。
    """

    def __init__(self, place_model_on_cpu=False):
        """
        Build the S3FD network and load its weights from ``S3FD.npy``.

        Args:
            place_model_on_cpu (bool): When True, the model is instantiated and
                its weights are loaded inside a ``tf.device("/CPU:0")`` scope.
                When False (the default) no explicit device scope is entered.

        Raises:
            Exception: If ``S3FD.npy`` does not exist next to this file.
        """
        # Initialise the nn backend with the NHWC data format.
        nn.initialize(data_format="NHWC")
        tf = nn.tf

        # The weights file is looked up in the same directory as this module.
        model_path = Path(__file__).parent / "S3FD.npy"
        if not model_path.exists():
            raise Exception("Unable to load S3FD.npy")

        class L2Norm(nn.LayerBase):
            """
            L2 normalisation layer with a per-channel scale.

            Divides the input by its L2 norm taken over the last axis and
            multiplies the result by the ``weight`` variable.
            """
            def __init__(self, n_channels, **kwargs):
                # n_channels is stored before the base constructor runs;
                # presumably LayerBase.__init__ calls build_weights() - TODO confirm.
                self.n_channels = n_channels
                super().__init__(**kwargs)

            def build_weights(self):
                # Scale variable of shape (1, 1, 1, n_channels), initialised to ones.
                self.weight = tf.get_variable("weight", (1, 1, 1, self.n_channels), 
                                            dtype=nn.floatx, initializer=tf.initializers.ones)

            def get_weights(self):
                return [self.weight]

            def __call__(self, inputs):
                """
                Apply L2 normalisation followed by the per-channel scale.

                Args:
                    inputs: Input feature map.

                Returns:
                    The normalised and scaled feature map.
                """
                x = inputs
                # x / (sqrt(sum(x^2)) + 1e-10) * weight
                # Note that the epsilon is added AFTER the square root, not inside it.
                x = x / (tf.sqrt(tf.reduce_sum(tf.pow(x, 2), axis=-1, keepdims=True)) + 1e-10) * self.weight
                return x

        class S3FD(nn.ModelBase):
            """
            S3FD face detection network.

            A convolutional backbone followed by six pairs of detection heads
            (confidence + location), one pair per tapped feature map.
            """
            def __init__(self):
                super().__init__(name='S3FD')

            def on_build(self):
                """
                Create every layer of the network.

                Declares the backbone convolutions, the three L2Norm layers and
                the confidence/location heads for each of the six feature maps.
                """
                # Per-channel constants subtracted from the input in forward().
                self.minus = tf.constant([104, 117, 123], dtype=nn.floatx)

                # Backbone stage 1: 3 -> 64 channels.
                self.conv1_1 = nn.Conv2D(3, 64, kernel_size=3, strides=1, padding='SAME')
                self.conv1_2 = nn.Conv2D(64, 64, kernel_size=3, strides=1, padding='SAME')

                # Backbone stage 2: 64 -> 128 channels.
                self.conv2_1 = nn.Conv2D(64, 128, kernel_size=3, strides=1, padding='SAME')
                self.conv2_2 = nn.Conv2D(128, 128, kernel_size=3, strides=1, padding='SAME')

                # Backbone stage 3: 128 -> 256 channels.
                self.conv3_1 = nn.Conv2D(128, 256, kernel_size=3, strides=1, padding='SAME')
                self.conv3_2 = nn.Conv2D(256, 256, kernel_size=3, strides=1, padding='SAME')
                self.conv3_3 = nn.Conv2D(256, 256, kernel_size=3, strides=1, padding='SAME')

                # Backbone stage 4: 256 -> 512 channels.
                self.conv4_1 = nn.Conv2D(256, 512, kernel_size=3, strides=1, padding='SAME')
                self.conv4_2 = nn.Conv2D(512, 512, kernel_size=3, strides=1, padding='SAME')
                self.conv4_3 = nn.Conv2D(512, 512, kernel_size=3, strides=1, padding='SAME')

                # Backbone stage 5: 512 -> 512 channels.
                self.conv5_1 = nn.Conv2D(512, 512, kernel_size=3, strides=1, padding='SAME')
                self.conv5_2 = nn.Conv2D(512, 512, kernel_size=3, strides=1, padding='SAME')
                self.conv5_3 = nn.Conv2D(512, 512, kernel_size=3, strides=1, padding='SAME')

                # Stage 6: layers named fc6/fc7 but implemented as convolutions.
                # fc6 uses an explicit numeric padding of 3 rather than 'SAME'.
                self.fc6 = nn.Conv2D(512, 1024, kernel_size=3, strides=1, padding=3)
                self.fc7 = nn.Conv2D(1024, 1024, kernel_size=1, strides=1, padding='SAME')

                # Stage 7: 1x1 channel reduction followed by a stride-2 3x3 convolution.
                self.conv6_1 = nn.Conv2D(1024, 256, kernel_size=1, strides=1, padding='SAME')
                self.conv6_2 = nn.Conv2D(256, 512, kernel_size=3, strides=2, padding='SAME')

                # Stage 8: 1x1 channel reduction followed by a stride-2 3x3 convolution.
                self.conv7_1 = nn.Conv2D(512, 128, kernel_size=1, strides=1, padding='SAME')
                self.conv7_2 = nn.Conv2D(128, 256, kernel_size=3, strides=2, padding='SAME')

                # L2Norm layers applied to the conv3_3 / conv4_3 / conv5_3 feature maps.
                self.conv3_3_norm = L2Norm(256)
                self.conv4_3_norm = L2Norm(512)
                self.conv5_3_norm = L2Norm(512)

                # Detection heads: one confidence conv and one location conv per feature map.
                # conv3_3 heads. The confidence head has 4 output channels here;
                # forward() reduces them to 2 before the softmax.
                self.conv3_3_norm_mbox_conf = nn.Conv2D(256, 4, kernel_size=3, strides=1, padding='SAME')
                self.conv3_3_norm_mbox_loc = nn.Conv2D(256, 4, kernel_size=3, strides=1, padding='SAME')

                # conv4_3 heads (2 confidence channels, 4 location channels).
                self.conv4_3_norm_mbox_conf = nn.Conv2D(512, 2, kernel_size=3, strides=1, padding='SAME')
                self.conv4_3_norm_mbox_loc = nn.Conv2D(512, 4, kernel_size=3, strides=1, padding='SAME')

                # conv5_3 heads.
                self.conv5_3_norm_mbox_conf = nn.Conv2D(512, 2, kernel_size=3, strides=1, padding='SAME')
                self.conv5_3_norm_mbox_loc = nn.Conv2D(512, 4, kernel_size=3, strides=1, padding='SAME')

                # fc7 heads.
                self.fc7_mbox_conf = nn.Conv2D(1024, 2, kernel_size=3, strides=1, padding='SAME')
                self.fc7_mbox_loc = nn.Conv2D(1024, 4, kernel_size=3, strides=1, padding='SAME')

                # conv6_2 heads.
                self.conv6_2_mbox_conf = nn.Conv2D(512, 2, kernel_size=3, strides=1, padding='SAME')
                self.conv6_2_mbox_loc = nn.Conv2D(512, 4, kernel_size=3, strides=1, padding='SAME')

                # conv7_2 heads (the most downsampled feature map).
                self.conv7_2_mbox_conf = nn.Conv2D(256, 2, kernel_size=3, strides=1, padding='SAME')
                self.conv7_2_mbox_loc = nn.Conv2D(256, 4, kernel_size=3, strides=1, padding='SAME')

            def forward(self, inp):
                """
                Run the network on one input tensor.

                Args:
                    inp: A sequence holding exactly one image tensor.

                Returns:
                    list: ``[cls1, reg1, ..., cls6, reg6]`` - for each of the six
                    feature maps, the softmaxed 2-channel confidence map followed
                    by the 4-channel location map.
                """
                x, = inp
                # Subtract the per-channel constants.
                x = x - self.minus

                # Stage 1 convolutions, then 2x2 max-pool with stride 2.
                x = tf.nn.relu(self.conv1_1(x))
                x = tf.nn.relu(self.conv1_2(x))
                x = tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID")

                # Stage 2 convolutions, then 2x2 max-pool with stride 2.
                x = tf.nn.relu(self.conv2_1(x))
                x = tf.nn.relu(self.conv2_2(x))
                x = tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID")

                # Stage 3 convolutions; the output is tapped before pooling.
                x = tf.nn.relu(self.conv3_1(x))
                x = tf.nn.relu(self.conv3_2(x))
                x = tf.nn.relu(self.conv3_3(x))
                f3_3 = x  # feature map kept for the detection heads
                x = tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID")

                # Stage 4 convolutions; the output is tapped before pooling.
                x = tf.nn.relu(self.conv4_1(x))
                x = tf.nn.relu(self.conv4_2(x))
                x = tf.nn.relu(self.conv4_3(x))
                f4_3 = x  # feature map kept for the detection heads
                x = tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID")

                # Stage 5 convolutions; the output is tapped before pooling.
                x = tf.nn.relu(self.conv5_1(x))
                x = tf.nn.relu(self.conv5_2(x))
                x = tf.nn.relu(self.conv5_3(x))
                f5_3 = x  # feature map kept for the detection heads
                x = tf.nn.max_pool(x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID")

                # Stage 6 (fc6 / fc7 convolutions).
                x = tf.nn.relu(self.fc6(x))
                x = tf.nn.relu(self.fc7(x))
                ffc7 = x  # feature map kept for the detection heads

                # Stage 7.
                x = tf.nn.relu(self.conv6_1(x))
                x = tf.nn.relu(self.conv6_2(x))
                f6_2 = x  # feature map kept for the detection heads

                # Stage 8.
                x = tf.nn.relu(self.conv7_1(x))
                x = tf.nn.relu(self.conv7_2(x))
                f7_2 = x  # feature map kept for the detection heads

                # L2-normalise the three earliest tapped feature maps.
                f3_3 = self.conv3_3_norm(f3_3)
                f4_3 = self.conv4_3_norm(f4_3)
                f5_3 = self.conv5_3_norm(f5_3)

                # Run the confidence (cls) and location (reg) heads on every feature map.
                # cls1 is left un-softmaxed here because it is reduced to 2 channels below.
                cls1 = self.conv3_3_norm_mbox_conf(f3_3)
                reg1 = self.conv3_3_norm_mbox_loc(f3_3)

                cls2 = tf.nn.softmax(self.conv4_3_norm_mbox_conf(f4_3))
                reg2 = self.conv4_3_norm_mbox_loc(f4_3)

                cls3 = tf.nn.softmax(self.conv5_3_norm_mbox_conf(f5_3))
                reg3 = self.conv5_3_norm_mbox_loc(f5_3)

                cls4 = tf.nn.softmax(self.fc7_mbox_conf(ffc7))
                reg4 = self.fc7_mbox_loc(ffc7)

                cls5 = tf.nn.softmax(self.conv6_2_mbox_conf(f6_2))
                reg5 = self.conv6_2_mbox_loc(f6_2)

                cls6 = tf.nn.softmax(self.conv7_2_mbox_conf(f7_2))
                reg6 = self.conv7_2_mbox_loc(f7_2)

                # conv3_3 confidence: take the element-wise max of channels 0..2,
                # concatenate it with channel 3, then softmax the resulting 2 channels.
                # The original author describes this as the S3FD "max-out background label" step.
                bmax = tf.maximum(tf.maximum(cls1[:, :, :, 0:1], cls1[:, :, :, 1:2]), cls1[:, :, :, 2:3])
                cls1 = tf.concat([bmax, cls1[:, :, :, 3:4]], axis=-1)
                cls1 = tf.nn.softmax(cls1)

                # Interleaved confidence / location outputs for all six feature maps.
                return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6]

        # Optionally create a CPU device scope for model construction and weight loading.
        e = None
        if place_model_on_cpu:
            e = tf.device("/CPU:0")

        # NOTE(review): the scope is entered/exited by hand without try/finally,
        # so e.__exit__ is skipped if S3FD() or load_weights() raises.
        if e is not None: e.__enter__()
        # Instantiate the model and load its weights.
        self.model = S3FD()
        self.model.load_weights(model_path)
        if e is not None: e.__exit__(None, None, None)

        # Prepare the model for running with a single float32 input whose shape comes
        # from nn.get4Dshape(None, None, 3); presumably variable height/width with
        # 3 channels - TODO confirm against nn.get4Dshape.
        self.model.build_for_run([(tf.float32, nn.get4Dshape(None, None, 3))])

    def __enter__(self):
        """
        上下文管理器入口

        返回:
            S3FDExtractor实例自身
        """
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """
        上下文管理器出口

        参数:
            exc_type: 异常类型
            exc_value: 异常值
            traceback: 堆栈跟踪

        返回:
            False - 将异常传递到外层
        """
        return False  # 传递异常到外层

    def extract(self, input_image, is_bgr=True, is_remove_intersects=False):
        """
        从图像中提取人脸边界框

        参数:
            input_image: 输入图像数组
            is_bgr: 输入图像是否为BGR格式，默认为True
            is_remove_intersects: 是否移除相交的人脸框，默认为False

        返回:
            list: 人脸边界框列表，每个边界框为[l, t, r, b]格式（左、上、右、下坐标）
        """

        # 颜色空间转换：如果是BGR格式，转换为RGB
        if is_bgr:
            input_image = input_image[:, :, ::-1]
            is_bgr = False

        # 获取图像尺寸
        (h, w, ch) = input_image.shape

        # 计算缩放因子，保持图像纵横比
        d = max(w, h)
        scale_to = 640 if d >= 1280 else d / 2
        scale_to = max(64, scale_to)  # 确保最小尺寸为64

        # 计算输入缩放比例
        input_scale = d / scale_to
        # 缩放图像以提高检测效率
        input_image = cv2.resize(input_image, (int(w/input_scale), int(h/input_scale)), 
                                interpolation=cv2.INTER_LINEAR)

        # 运行模型进行检测
        olist = self.model.run([input_image[None, ...]])

        # 处理检测结果
        detected_faces = []
        # 对模型输出进行后处理，获取人脸边界框
        for ltrb in self.refine(olist):
            # 将边界框坐标缩放到原始图像尺寸
            l, t, r, b = [x * input_scale for x in ltrb]
            bt = b - t
            # 过滤过小的人脸（小于40像素的边界框）
            if min(r - l, bt) < 40:  # 过滤任何边长小于40像素的人脸
                continue
            # 稍微扩大底部边界，以更好地包含下巴区域
            # 这是为了与2DFAN-4关键点检测器更好地配合
            b += bt * 0.1
            detected_faces.append([int(x) for x in (l, t, r, b)])

        # 按面积大小对人脸框进行排序，面积大的排在前面
        detected_faces = [[(l, t, r, b), (r - l) * (b - t)] for (l, t, r, b) in detected_faces]
        detected_faces = sorted(detected_faces, key=operator.itemgetter(1), reverse=True)
        detected_faces = [x[0] for x in detected_faces]

        # 移除相交的人脸框（保留面积较大的）
        if is_remove_intersects:
            for i in range(len(detected_faces) - 1, 0, -1):
                l1, t1, r1, b1 = detected_faces[i]
                l0, t0, r0, b0 = detected_faces[i - 1]

                # 计算两个边界框的交集
                dx = min(r0, r1) - max(l0, l1)
                dy = min(b0, b1) - max(t0, t1)
                # 如果有交集，则移除当前人脸框
                if (dx >= 0) and (dy >= 0):
                    detected_faces.pop(i)

        return detected_faces

    def refine(self, olist):
        """
        处理模型输出，生成边界框列表

        参数:
            olist: 模型输出的分类和回归结果列表

        返回:
            list: 处理后的边界框列表
        """
        bboxlist = []
        # 遍历所有尺度的输出结果
        for i, ((ocls,), (oreg,)) in enumerate(zip(olist[::2], olist[1::2])):
            # 计算当前层的步长（每个特征图单元对应原图的像素数）
            stride = 2 ** (i + 2)  # 4, 8, 16, 32, 64, 128
            s_d2 = stride / 2
            s_m4 = stride * 4

            # 遍历所有置信度大于阈值的位置
            for hindex, windex in zip(*np.where(ocls[..., 1] > 0.05)):
                # 获取置信度分数
                score = ocls[hindex, windex, 1]
                # 获取回归偏移量
                loc = oreg[hindex, windex, :]
                # 计算先验框的中心和大小
                priors = np.array([windex * stride + s_d2, hindex * stride + s_d2, s_m4, s_m4])
                priors_2p = priors[2:]
                # 根据回归偏移量调整边界框
                box = np.concatenate((priors[:2] + loc[:2] * 0.1 * priors_2p,
                                      priors_2p * np.exp(loc[2:] * 0.2)))
                # 将中心点+大小格式转换为左上角+右下角格式
                box[:2] -= box[2:] / 2
                box[2:] += box[:2]

                # 添加边界框和置信度到列表
                bboxlist.append([*box, score])

        # 转换为numpy数组
        bboxlist = np.array(bboxlist)
        # 如果没有检测到人脸，返回空边界框
        if len(bboxlist) == 0:
            bboxlist = np.zeros((1, 5))

        # 应用非极大值抑制，过滤重叠边界框
        bboxlist = bboxlist[self.refine_nms(bboxlist, 0.3), :]
        # 过滤低置信度边界框，并转换为整数坐标
        bboxlist = [x[:-1].astype(np.int) for x in bboxlist if x[-1] >= 0.5]
        return bboxlist

    def refine_nms(self, dets, thresh):
        """
        非极大值抑制（NMS）算法实现

        参数:
            dets: 检测边界框数组，格式为[x1, y1, x2, y2, score]
            thresh: 重叠阈值，大于此值的重叠边界框将被抑制

        返回:
            list: 保留的边界框索引列表
        """
        keep = list()
        # 处理空输入情况
        if len(dets) == 0:
            return keep

        # 提取边界框坐标和置信度
        x_1, y_1, x_2, y_2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
        # 计算每个边界框的面积
        areas = (x_2 - x_1 + 1) * (y_2 - y_1 + 1)
        # 按置信度降序排列
        order = scores.argsort()[::-1]

        keep = []
        # 迭代执行非极大值抑制
        while order.size > 0:
            # 保留置信度最高的边界框
            i = order[0]
            keep.append(i)
            # 计算当前边界框与其他边界框的交集
            xx_1, yy_1 = np.maximum(x_1[i], x_1[order[1:]]), np.maximum(y_1[i], y_1[order[1:]])
            xx_2, yy_2 = np.minimum(x_2[i], x_2[order[1:]]), np.minimum(y_2[i], y_2[order[1:]])

            # 计算交集区域的宽度和高度
            width, height = np.maximum(0.0, xx_2 - xx_1 + 1), np.maximum(0.0, yy_2 - yy_1 + 1)
            # 计算交并比(IoU)
            ovr = width * height / (areas[i] + areas[order[1:]] - width * height)

            # 保留重叠度小于阈值的边界框
            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]
        return keep

【DFL学习日记#3】S3FDExtractor.py 检测人脸边界框

曼加塔第一期

曼加塔第二期