From e35cf6a8d95580dd5165fd70e3a5ddaa1526291c Mon Sep 17 00:00:00 2001
From: Vlad Zagorodniy <vladzzag@gmail.com>
Date: Tue, 3 Apr 2018 00:33:07 +0300
Subject: [PATCH] use box blur to render box shadows

---
 CMakeLists.txt                            |   2 -
 cmake/Modules/FindFFTW.cmake              |  20 ---
 libbreezecommon/CMakeLists.txt            |   7 +-
 libbreezecommon/breezeboxshadowhelper.cpp | 286 ++++++++++++------------------
 libbreezecommon/breezeboxshadowhelper.h   |   6 +-
 5 files changed, 116 insertions(+), 205 deletions(-)
 delete mode 100644 cmake/Modules/FindFFTW.cmake
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08b152c1..0eff8af4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,6 @@ include(GenerateExportHeader)
 include(WriteBasicConfigVersionFile)
 include(FeatureSummary)
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules")
-
 if(USE_KDE4)
   find_package(KDE4 REQUIRED)
   include(KDE4Defaults)
diff --git a/cmake/Modules/FindFFTW.cmake b/cmake/Modules/FindFFTW.cmake
deleted file mode 100644
index c3214373..00000000
--- a/cmake/Modules/FindFFTW.cmake
+++ /dev/null
@@ -1,20 +0,0 @@
-# Find the FFTW library
-#
-# Usage:
-#   find_package(FFTW [REQUIRED])
-#
-# It sets the following variables:
-#   FFTW_FOUND
-#   FFTW_INCLUDES
-#   FFTW_LIBRARIES
-
-
-find_path(FFTW_INCLUDES fftw3.h)
-
-find_library(FFTW_LIBRARIES NAMES fftw3)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(FFTW DEFAULT_MSG
-                                  FFTW_INCLUDES FFTW_LIBRARIES)
-
-mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES)
diff --git a/libbreezecommon/CMakeLists.txt b/libbreezecommon/CMakeLists.txt
index af9848de..5233ed9f 100644
--- a/libbreezecommon/CMakeLists.txt
+++ b/libbreezecommon/CMakeLists.txt
@@ -11,8 +11,6 @@ if (BREEZE_COMMON_USE_KDE4)
 endif ()
 
 ################# dependencies #################
-### FFTW
-find_package(FFTW REQUIRED)
 
 ### Qt/KDE
 if (NOT BREEZE_COMMON_USE_KDE4)
@@ -35,7 +33,6 @@ if (BREEZE_COMMON_USE_KDE4)
         EXPORT_FILE_NAME breezecommon_export.h)
 
     target_link_libraries(breezecommon4 ${KDE4_KDEUI_LIBS})
-    target_link_libraries(breezecommon4 ${FFTW_LIBRARIES})
 
     set_target_properties(breezecommon4 PROPERTIES
         VERSION ${PROJECT_VERSION}
@@ -52,9 +49,7 @@ else ()
     target_link_libraries(breezecommon
         PUBLIC
             Qt5::Core
-            Qt5::Gui
-        PRIVATE
-            ${FFTW_LIBRARIES})
+            Qt5::Gui)
 
     set_target_properties(breezecommon PROPERTIES
         VERSION ${PROJECT_VERSION}
diff --git a/libbreezecommon/breezeboxshadowhelper.cpp b/libbreezecommon/breezeboxshadowhelper.cpp
index dea26e93..0aaa9f05 100644
--- a/libbreezecommon/breezeboxshadowhelper.cpp
+++ b/libbreezecommon/breezeboxshadowhelper.cpp
@@ -21,214 +21,155 @@
 #include "breezeboxshadowhelper.h"
 #include "config-breezecommon.h"
 
+// Qt
 #include <QVector>
 
-#include <fftw3.h>
-
+// std
 #include <cmath>
 
 
 namespace Breeze {
 namespace BoxShadowHelper {
 
-QVector<double> computeGaussianKernel(double radius, double sigma)
-{
-    QVector<double> kernel;
-    const int kernelSize = static_cast<int>(radius) * 2 + 1;
-
-    const double den = sqrt(2.0) * sigma;
-    double kernelNorm = 0.0;
-    double lastInt = 0.5 * std::erf((-radius - 0.5) / den);
-
-    for (int i = 0; i < kernelSize; i++) {
-        const double currInt = 0.5 * std::erf((i - radius + 0.5) / den);
-        const double w = currInt - lastInt;
-        kernel << w;
-        kernelNorm += w;
-        lastInt = currInt;
-    }
-
-    for (auto &w : kernel) {
-        w /= kernelNorm;
-    }
-
-    return kernel;
+namespace {
+// According to the CSS Level 3 spec, the standard deviation must be equal to
+// half of the blur radius. https://www.w3.org/TR/css-backgrounds-3/#shadow-blur
+// The current window size is too small for a sigma equal to half of the blur
+// radius. As a workaround, the sigma blur scale is lowered. With the lowered
+// scale, the area under the kernel equals 0.98, which is good enough.
+// This may need to be revisited in the future.
+const qreal BLUR_SIGMA_SCALE = 0.4375;
 }
 
-// Blur alpha channel of the given image using separable convolution
-// gaussian kernel. Not very efficient with big blur radii.
-void blurAlphaSeparable(QImage &img, double radius, double sigma)
+inline qreal radiusToSigma(qreal radius)
 {
-    const auto kernel = computeGaussianKernel(radius, sigma);
+    return radius * BLUR_SIGMA_SCALE; // sigma is a fixed fraction of the blur radius
+}
 
-    QImage tmp(img.height(), img.width(), img.format());
+inline int boxSizeToRadius(int boxSize)
+{
+    return (boxSize - 1) / 2; // pixels on each side of the center; box sizes are odd
+}
 
-    QRgb *imgData = reinterpret_cast<QRgb *>(img.scanLine(0));
-    QRgb *tmpData = reinterpret_cast<QRgb *>(tmp.scanLine(0));
-    const int imgStride = img.width();
-    const int tmpStride = tmp.width();
+class BoxBlurProfile // box sizes and padding used to approximate a Gaussian blur with box blurs
+{
+public:
+    BoxBlurProfile(int radius, int passes = 3); // radius: blur radius; passes: number of box blurs
 
-    const int shift = static_cast<int>(radius);
+    int padding() const; // margin to reserve around the blurred shape, per side
+    QVector<int> boxSizes() const; // box width for each pass, in pass order
 
-    // Blur in X direction. Please note, the result is stored in a temporary
-    // transposed buffer. The result is transposed to read memory in linear order.
-    for (int y = 0; y < img.height(); y++) {
-        for (int x = 0; x < img.width(); x++) {
-            double alpha = 0.0;
+private:
+    int m_padding; // set to the blur radius by the constructor
+    QVector<int> m_boxSizes; // one odd box width per pass
+};
 
-            for (int i = 0; i < kernel.size(); i++) {
-                const int idx = y * imgStride + qBound(0, x + i - shift, img.width() - 1);
-                alpha += qAlpha(imgData[idx]) * kernel[i];
-            }
+BoxBlurProfile::BoxBlurProfile(int radius, int passes)
+{
+    const qreal sigma = radiusToSigma(radius); // standard deviation of the Gaussian to approximate
 
-            const int idx = x * tmpStride + y;
-            tmpData[idx] = qRgba(0, 0, 0, static_cast<int>(alpha));
-        }
+    // Box sizes are computed according to the "Fast Almost-Gaussian Filtering"
+    // paper, see http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
+    int lower = std::floor(std::sqrt(12 * std::pow(sigma, 2) / passes + 1)); // ideal box width, rounded down
+    if (lower % 2 == 0) {
+        lower--; // box widths must be odd so the box has a center pixel
     }
-
-    // Blur in Y direction. The result is transposed again so size
-    // matches original image size.
-    for (int y = 0; y < tmp.height(); y++) {
-        for (int x = 0; x < tmp.width(); x++) {
-            double alpha = 0.0;
-
-            for (int i = 0; i < kernel.size(); i++) {
-                const int idx = y * tmpStride + qBound(0, x + i - shift, tmp.width() - 1);
-                alpha += qAlpha(tmpData[idx]) * kernel[i];
-            }
-
-            const int idx = x * imgStride + y;
-            imgData[idx] = qRgba(0, 0, 0, static_cast<int>(alpha));
-        }
+    const int upper = lower + 2; // next odd width above lower
+
+    const int threshold = std::round( // number of passes that use the smaller box
+        (12 * std::pow(sigma, 2)
+            - passes * std::pow(lower, 2)
+            - 4 * passes * lower
+            - 3 * passes)
+        / (-4 * lower - 4));
+
+    m_padding = radius; // callers reserve this many pixels around the shape
+    for (int i = 0; i < passes; i++) {
+        m_boxSizes.append(i < threshold ? lower : upper); // smaller boxes first, then larger
     }
 }
 
-// Blur alpha channel of the given image using Fourier Transform.
-// It's somewhat efficient with big blur radii.
-//
-// It works as follows:
-//   - do FFT on given input image(it is expected, that the
-//     input image was padded before)
-//   - compute Gaussian kernel, pad it to the size of the input
-//     image, and do FFT on it
-//   - multiply the two in the frequency domain(element-wise)
-//   - transform the result back to "time domain"
-//
-void blurAlphaFFT(QImage &img, double radius, double sigma)
+int BoxBlurProfile::padding() const
 {
-    QRgb *data = reinterpret_cast<QRgb *>(img.scanLine(0));
-    const int size = img.width() * img.height();
-
-    // Use FFTW's malloc function so the returned pointer obeys any
-    // special alignment restrictions. (e.g. for SIMD acceleration, etc)
-    // See http://www.fftw.org/fftw3_doc/Memory-Allocation.html
-    fftw_complex *imageIn = fftw_alloc_complex(size);
-    fftw_complex *imageOut = fftw_alloc_complex(size);
-
-    for (int i = 0; i < size; i++) {
-        imageIn[i][0] = qAlpha(data[i]);
-        imageIn[i][1] = 0.0;
-    }
-
-    fftw_plan imageFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        imageIn, imageOut,
-        FFTW_FORWARD, FFTW_ESTIMATE);
-
-    fftw_plan imageIFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        imageOut, imageIn,
-        FFTW_BACKWARD, FFTW_ESTIMATE);
-
-    // The computed Gaussian kernel has to have the same size as the input image.
-    // Please note that the center of the computed Gaussian kernel is placed
-    // at the top-left corner and the whole kernel is wrapped around so we read
-    // result in linear order.
-    // Note: the kernel is computed by taking a product of two 1-D Gaussian kernels.
-    QVector<double> kernel(size, 0);
-    const QVector<double> kernel_ = computeGaussianKernel(radius, sigma);
-    const int kernelSize = kernel_.size();
-    const int shift = -static_cast<int>(radius);
-    for (int y = 0; y < kernelSize; y++) {
-        for (int x = 0; x < kernelSize; x++) {
-            const int j = (img.width() + x + shift) % img.width();
-            const int i = (img.height() + y + shift) % img.height();
-            kernel[j + i * img.width()] = kernel_[x] * kernel_[y];
-        }
-    }
-
-    fftw_complex *kernelIn = fftw_alloc_complex(kernel.size());
-    fftw_complex *kernelOut = fftw_alloc_complex(kernel.size());
+    return m_padding; // equals the radius passed to the constructor
+}
 
-    for (int i = 0; i < size; i++) {
-        kernelIn[i][0] = kernel[i];
-        kernelIn[i][1] = 0.0;
-    }
+QVector<int> BoxBlurProfile::boxSizes() const
+{
+    return m_boxSizes; // one entry per blur pass, filled in by the constructor
+}
 
-    fftw_plan kernelFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        kernelIn, kernelOut,
-        FFTW_FORWARD, FFTW_ESTIMATE);
+inline void boxBlurPass(QImage& src, QImage& dst, int boxSize) // blurs alpha along src rows, writes dst transposed
+{
+    const int stride = src.depth() >> 3; // bytes per pixel (depth is in bits)
+    const int alphaOffset = QSysInfo::ByteOrder == QSysInfo::BigEndian ? 0 : 3; // alpha byte; assumes 32-bit ARGB
 
-    // Do actual FFT.
-    fftw_execute(imageFFT);
-    fftw_execute(kernelFFT);
+    const int radius = boxSizeToRadius(boxSize);
+    const qreal invSize = 1.0 / boxSize; // always divide by the full box: pixels outside the row count as 0
 
-    for (int i = 0; i < size; i++) {
-        const double re = imageOut[i][0] * kernelOut[i][0] - imageOut[i][1] * kernelOut[i][1];
-        const double im = imageOut[i][0] * kernelOut[i][1] + imageOut[i][1] * kernelOut[i][0];
-        imageOut[i][0] = re;
-        imageOut[i][1] = im;
-    }
+    for (int y = 0; y < src.height(); y++) {
+        uchar* srcAlpha = src.scanLine(y);
+        uchar* dstAlpha = dst.scanLine(0);
 
-    fftw_execute(imageIFFT);
+        srcAlpha += alphaOffset;
+        dstAlpha += alphaOffset + y * stride; // row y of src becomes column y of dst
 
-    // Copy result back. Please note, result is scaled by `width x height` so we need to scale it down.
-    for (int i = 0; i < size; i++) {
-        data[i] = qRgba(0, 0, 0, imageIn[i][0] / size);
-    }
+        uchar* left = srcAlpha; // next pixel to drop from the window
+        uchar* right = left + stride * radius; // next pixel to add to the window
 
-    fftw_destroy_plan(kernelFFT);
-    fftw_destroy_plan(imageFFT);
-    fftw_destroy_plan(imageIFFT);
+        int window = 0; // running sum of the alpha values currently under the box
+        for (int x = 0; x < radius; x++) { // prime the window with the first `radius` pixels
+            window += *srcAlpha;
+            srcAlpha += stride;
+        }
 
-    fftw_free(kernelIn);
-    fftw_free(kernelOut);
+        for (int x = 0; x <= radius; x++) { // left edge: the window only grows
+            window += *right;
+            right += stride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dst.width() * stride; // next dst row; assumes rows are tightly packed
+        }
 
-    fftw_free(imageIn);
-    fftw_free(imageOut);
-}
+        for (int x = radius + 1; x < src.width() - radius; x++) { // interior: slide the full window
+            window += *right - *left;
+            left += stride;
+            right += stride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dst.width() * stride;
+        }
 
-namespace {
-    // FFT approach outperforms separable convolution kernels when blur radius >= 64.
-    // (was discovered after doing a lot of benchmarks)
-    const int FFT_BLUR_RADIUS_THRESHOLD = 64;
-
-    // According to the CSS Level 3 spec, standard deviation must be equal to
-    // half of the blur radius. https://www.w3.org/TR/css-backgrounds-3/#shadow-blur
-    // Current window size is too small for sigma equal to half of the blur radius.
-    // As a workaround, sigma blur scale is lowered. With the lowered sigma
-    // blur scale, area under the kernel equals to 0.98, which is pretty enough.
-    // Maybe, it should be changed in the future.
-    const double SIGMA_BLUR_SCALE = 0.4375;
+        for (int x = src.width() - radius; x < src.width(); x++) { // right edge: the window only shrinks
+            window -= *left;
+            left += stride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dst.width() * stride;
+        }
+    }
 }
 
-inline double radiusToSigma(double radius)
+void boxBlurAlpha(QImage& image, const BoxBlurProfile& profile)
 {
-    return SIGMA_BLUR_SCALE * radius;
+    // The temporary buffer is transposed, so both passes below can read
+    // their source row by row; only its alpha bytes are ever written.
+    QImage tmp(image.height(), image.width(), image.format());
+
+    const auto boxSizes = profile.boxSizes();
+    for (const auto& boxSize : boxSizes) {
+        boxBlurPass(image, tmp, boxSize); // horizontal pass: image rows -> tmp columns
+        boxBlurPass(tmp, image, boxSize); // vertical pass: tmp rows -> image columns
+    }
 }
 
-void boxShadow(QPainter *p, const QRect &box, const QPoint &offset, int radius, const QColor &color)
+void boxShadow(QPainter* p, const QRect& box, const QPoint& offset,
+               int radius, const QColor& color)
 {
-    const QSize size = box.size() + 2 * QSize(radius, radius);
-
 #if BREEZE_COMMON_USE_KDE4
     const qreal dpr = 1.0;
 #else
     const qreal dpr = p->device()->devicePixelRatioF();
 #endif
-
-    QPainter painter;
+    const BoxBlurProfile profile(radius * dpr, 3);
+    const QSize size = box.size() + 2 * QSize(profile.padding(), profile.padding());
 
     QImage shadow(size * dpr, QImage::Format_ARGB32_Premultiplied);
 #if !BREEZE_COMMON_USE_KDE4
@@ -236,19 +177,14 @@ void boxShadow(QPainter *p, const QRect &box, const QPoint &offset, int radius,
 #endif
     shadow.fill(Qt::transparent);
 
+    QPainter painter;
     painter.begin(&shadow);
-    painter.fillRect(QRect(QPoint(radius, radius), box.size()), Qt::black);
+    painter.fillRect(QRect(QPoint(profile.padding(), profile.padding()), box.size()), Qt::black);
     painter.end();
 
     // There is no need to blur RGB channels. Blur the alpha
     // channel and do compositing stuff later.
-    const double radius_ = radius * dpr;
-    const double sigma = radiusToSigma(radius_);
-    if (radius_ < FFT_BLUR_RADIUS_THRESHOLD) {
-        blurAlphaSeparable(shadow, radius_, sigma);
-    } else {
-        blurAlphaFFT(shadow, radius_, sigma);
-    }
+    boxBlurAlpha(shadow, profile);
 
     painter.begin(&shadow);
     painter.setCompositionMode(QPainter::CompositionMode_SourceIn);
@@ -261,5 +197,5 @@ void boxShadow(QPainter *p, const QRect &box, const QPoint &offset, int radius,
     p->drawImage(shadowRect, shadow);
 }
 
-} // BoxShadowHelper
-} // Breeze
+} // namespace BoxShadowHelper
+} // namespace Breeze
diff --git a/libbreezecommon/breezeboxshadowhelper.h b/libbreezecommon/breezeboxshadowhelper.h
index 578b16ca..28ff2825 100644
--- a/libbreezecommon/breezeboxshadowhelper.h
+++ b/libbreezecommon/breezeboxshadowhelper.h
@@ -23,6 +23,7 @@
 
 #include "breezecommon_export.h"
 
+// Qt
 #include <QColor>
 #include <QPainter>
 #include <QPoint>
@@ -35,7 +36,8 @@ namespace BoxShadowHelper {
 void BREEZECOMMON_EXPORT boxShadow(QPainter *p, const QRect &box, const QPoint &offset,
                                    int radius, const QColor &color);
 
-} // BoxShadowHelper
-} // Breeze
+
+} // namespace BoxShadowHelper
+} // namespace Breeze
 
 #endif // BREEZE_COMMON_BOXSHADOWHELPER_H
-- 
2.16.3