diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,6 @@
 include(WriteBasicConfigVersionFile)
 include(FeatureSummary)
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules")
-
 if(USE_KDE4)
   find_package(KDE4 REQUIRED)
   include(KDE4Defaults)
diff --git a/cmake/Modules/FindFFTW.cmake b/cmake/Modules/FindFFTW.cmake
deleted file mode 100644
--- a/cmake/Modules/FindFFTW.cmake
+++ /dev/null
@@ -1,20 +0,0 @@
-# Find the FFTW library
-#
-# Usage:
-#   find_package(FFTW [REQUIRED])
-#
-# It sets the following variables:
-#   FFTW_FOUND
-#   FFTW_INCLUDES
-#   FFTW_LIBRARIES
-
-
-find_path(FFTW_INCLUDES fftw3.h)
-
-find_library(FFTW_LIBRARIES NAMES fftw3)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(FFTW DEFAULT_MSG
-                                  FFTW_INCLUDES FFTW_LIBRARIES)
-
-mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES)
diff --git a/libbreezecommon/CMakeLists.txt b/libbreezecommon/CMakeLists.txt
--- a/libbreezecommon/CMakeLists.txt
+++ b/libbreezecommon/CMakeLists.txt
@@ -11,9 +11,6 @@
 endif ()
 
 ################# dependencies #################
-### FFTW
-find_package(FFTW REQUIRED)
-
 ### Qt/KDE
 if (NOT BREEZE_COMMON_USE_KDE4)
     find_package(Qt5 REQUIRED CONFIG COMPONENTS Widgets)
@@ -35,7 +32,6 @@
         EXPORT_FILE_NAME breezecommon_export.h)
 
     target_link_libraries(breezecommon4 ${KDE4_KDEUI_LIBS})
-    target_link_libraries(breezecommon4 ${FFTW_LIBRARIES})
 
     set_target_properties(breezecommon4 PROPERTIES
         VERSION ${PROJECT_VERSION}
@@ -52,9 +48,7 @@
     target_link_libraries(breezecommon5
         PUBLIC
             Qt5::Core
-            Qt5::Gui
-        PRIVATE
-            ${FFTW_LIBRARIES})
+            Qt5::Gui)
 
     set_target_properties(breezecommon5 PROPERTIES
         VERSION ${PROJECT_VERSION}
diff --git a/libbreezecommon/breezeboxshadowhelper.cpp b/libbreezecommon/breezeboxshadowhelper.cpp
--- a/libbreezecommon/breezeboxshadowhelper.cpp
+++ b/libbreezecommon/breezeboxshadowhelper.cpp
@@ -23,255 +23,166 @@
 
 #include <QVector>
 
-#include <fftw3.h>
-
 #include <cmath>
 
 
 namespace Breeze {
 namespace BoxShadowHelper {
 
 namespace {
-    // FFT approach outperforms naive blur method when blur radius >= 64.
-    // (was discovered after doing a lot of benchmarks)
-    const int FFT_BLUR_RADIUS_THRESHOLD = 64;
-
-    // According to the CSS Level 3 spec, standard deviation must be equal to
-    // half of the blur radius. https://www.w3.org/TR/css-backgrounds-3/#shadow-blur
-    // Current window size is too small for sigma equal to half of the blur radius.
-    // As a workaround, sigma blur scale is lowered. With the lowered sigma
-    // blur scale, area under the kernel equals to 0.98, which is pretty enough.
-    // Maybe, it should be changed in the future.
-    const double SIGMA_BLUR_SCALE = 0.4375;
+// According to the CSS Level 3 spec, standard deviation must be equal to
+// half of the blur radius. https://www.w3.org/TR/css-backgrounds-3/#shadow-blur
+// Current window size is too small for sigma equal to half of the blur radius.
+// As a workaround, sigma blur scale is lowered. With the lowered sigma
+// blur scale, area under the kernel equals to 0.98, which is pretty enough.
+// Maybe, it should be changed in the future.
+const qreal BLUR_SIGMA_SCALE = 0.4375;
 }
 
-inline int kernelSizeToRadius(int kernelSize)
+inline qreal radiusToSigma(qreal radius)
 {
-    return (kernelSize - 1) / 2;
+    return radius * BLUR_SIGMA_SCALE;
 }
 
-inline int radiusToKernelSize(int radius)
+inline int boxSizeToRadius(int boxSize)
 {
-    return radius * 2 + 1;
+    return (boxSize - 1) / 2;
 }
 
-QVector<double> computeGaussianKernel(int radius)
+class BoxBlurProfile
 {
-    QVector<double> kernel;
-    const int kernelSize = radiusToKernelSize(radius);
-    kernel.reserve(kernelSize);
-
-    const double sigma = SIGMA_BLUR_SCALE * radius;
-    const double den = std::sqrt(2.0) * sigma;
-    double kernelNorm = 0.0;
-    double lastInt = 0.5 * std::erf((-radius - 0.5) / den);
-
-    for (int i = 0; i < kernelSize; i++) {
-        const double currInt = 0.5 * std::erf((i - radius + 0.5) / den);
-        const double w = currInt - lastInt;
-        kernel << w;
-        kernelNorm += w;
-        lastInt = currInt;
-    }
+public:
+    BoxBlurProfile(int radius, int passes = 3);
 
-    for (auto &w : kernel) {
-        w /= kernelNorm;
-    }
+    int padding() const;
+    QVector<int> boxSizes() const;
 
-    return kernel;
-}
+private:
+    int m_padding;
+    QVector<int> m_boxSizes;
+};
 
-// Do horizontal pass of the Gaussian filter. Please notice that the result
-// is transposed. So, the dst image should have proper size, e.g. if the src
-// image have (wxh) size then the dst image should have (hxw) size. The
-// result is transposed so we read memory in linear order.
-void blurAlphaNaivePass(const QImage &src, QImage &dst, const QVector<double> &kernel)
+BoxBlurProfile::BoxBlurProfile(int radius, int passes)
 {
-    const int alphaOffset = QSysInfo::ByteOrder == QSysInfo::BigEndian ? 0 : 3;
-    const int alphaStride = src.depth() >> 3;
-    const int radius = kernelSizeToRadius(kernel.size());
-
-    for (int y = 0; y < src.height(); y++) {
-        const uchar *in = src.scanLine(y) + alphaOffset;
-        uchar *out = dst.scanLine(0) + alphaOffset + y * alphaStride;
-
-        for (int x = 0; x < radius; x++) {
-            const uchar *window = in;
-            double alpha = 0.0;
-            for (int k = radius - x; k < kernel.size(); k++) {
-                alpha += *window * kernel[k];
-                window += alphaStride;
-            }
-            *out = static_cast<uchar>(alpha);
-            out += dst.width() * alphaStride;
-        }
+    const qreal sigma = radiusToSigma(radius);
 
-        for (int x = radius; x < src.width() - radius; x++) {
-            const uchar *window = in + (x - radius) * alphaStride;
-            double alpha = 0.0;
-            for (int k = 0; k < kernel.size(); k++) {
-                alpha += *window * kernel[k];
-                window += alphaStride;
-            }
-            *out = static_cast<uchar>(alpha);
-            out += dst.width() * alphaStride;
-        }
+    // Box sizes are computed according to the "Fast Almost-Gaussian Filtering"
+    // paper by Peter Kovesi.
+    int lower = std::floor(std::sqrt(12 * std::pow(sigma, 2) / passes + 1));
+    if (lower % 2 == 0) {
+        lower--;
+    }
 
-        for (int x = src.width() - radius; x < src.width(); x++) {
-            const uchar *window = in + (x - radius - 1) * alphaStride;
-            double alpha = 0.0;
-            const int outside = x + radius - src.width();
-            for (int k = 0; k < kernel.size() - outside; k++) {
-                alpha += *window * kernel[k];
-                window += alphaStride;
-            }
-            *out = static_cast<uchar>(alpha);
-            out += dst.width() * alphaStride;
-        }
+    const int upper = lower + 2;
+    const int threshold = std::round((12 * std::pow(sigma, 2) - passes * std::pow(lower, 2)
+        - 4 * passes * lower - 3 * passes) / (-4 * lower - 4));
+
+    m_boxSizes.reserve(passes);
+    for (int i = 0; i < passes; ++i) {
+        m_boxSizes.append(i < threshold ? lower : upper);
     }
+
+    m_padding = radius;
 }
 
-// Blur alpha channel of the given image using separable convolution
-// gaussian kernel. Not very efficient with big blur radii.
-void blurAlphaNaive(QImage &img, int radius)
+int BoxBlurProfile::padding() const
 {
-    const QVector<double> kernel = computeGaussianKernel(radius);
-    QImage tmp(img.height(), img.width(), img.format());
+    return m_padding;
+}
 
-    blurAlphaNaivePass(img, tmp, kernel); // horizontal pass
-    blurAlphaNaivePass(tmp, img, kernel); // vertical pass
+QVector<int> BoxBlurProfile::boxSizes() const
+{
+    return m_boxSizes;
 }
 
-// Blur alpha channel of the given image using Fourier Transform.
-// It's somewhat efficient with big blur radii.
-//
-// It works as follows:
-//   - do FFT on given input image(it is expected, that the
-//     input image was padded before)
-//   - compute Gaussian kernel, pad it to the size of the input
-//     image, and do FFT on it
-//   - multiply the two in the frequency domain(element-wise)
-//   - transform the result back to "time domain"
-//
-void blurAlphaFFT(QImage &img, int radius)
+void boxBlurPass(const QImage &src, QImage &dst, int boxSize)
 {
+    const int alphaStride = src.depth() >> 3;
     const int alphaOffset = QSysInfo::ByteOrder == QSysInfo::BigEndian ? 0 : 3;
-    const int alphaStride = img.depth() >> 3;
-    const int size = img.width() * img.height();
-
-    // Use FFTW's malloc function so the returned pointer obeys any
-    // special alignment restrictions. (e.g. for SIMD acceleration, etc)
-    // See http://www.fftw.org/fftw3_doc/MekernelSizeToRadius(mory-Allocation.html
-    fftw_complex *imageIn = fftw_alloc_complex(size);
-    fftw_complex *imageOut = fftw_alloc_complex(size);
-
-    uchar *data = img.scanLine(0) + alphaOffset;
-    for (int i = 0; i < size; i++) {
-        imageIn[i][0] = *data;
-        imageIn[i][1] = 0.0;
-        data += alphaStride;
-    }
 
-    fftw_plan imageFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        imageIn, imageOut,
-        FFTW_FORWARD, FFTW_ESTIMATE);
-
-    fftw_plan imageIFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        imageOut, imageIn,
-        FFTW_BACKWARD, FFTW_ESTIMATE);
-
-    // The computed Gaussian kernel has to have the same size as the input image.
-    // Please note that the center of the computed Gaussian kernel is placed
-    // at the top-left corner and the whole kernel is wrapped around so we read
-    // result in linear order.
-    // Note: the kernel is computed by taking a product of two 1-D Gaussian kernels.
-    QVector<double> kernel(size, 0);
-    const QVector<double> kernel_ = computeGaussianKernel(radius);
-    for (int y = 0; y < kernel_.size(); y++) {
-        const int i = (img.height() + y - radius) % img.height();
-        for (int x = 0; x < kernel_.size(); x++) {
-            const int j = (img.width() + x - radius) % img.width();
-            kernel[j + i * img.width()] = kernel_[x] * kernel_[y];
-        }
-    }
+    const int radius = boxSizeToRadius(boxSize);
+    const qreal invSize = 1.0 / boxSize;
 
-    fftw_complex *kernelIn = fftw_alloc_complex(kernel.size());
-    fftw_complex *kernelOut = fftw_alloc_complex(kernel.size());
+    const int dstStride = dst.width() * alphaStride;
 
-    for (int i = 0; i < size; i++) {
-        kernelIn[i][0] = kernel[i];
-        kernelIn[i][1] = 0.0;
-    }
-
-    fftw_plan kernelFFT = fftw_plan_dft_2d(
-        img.height(), img.width(),
-        kernelIn, kernelOut,
-        FFTW_FORWARD, FFTW_ESTIMATE);
+    for (int y = 0; y < src.height(); ++y) {
+        const uchar *srcAlpha = src.scanLine(y);
+        uchar *dstAlpha = dst.scanLine(0);
 
-    // Do actual FFT.
-    fftw_execute(imageFFT);
-    fftw_execute(kernelFFT);
+        srcAlpha += alphaOffset;
+        dstAlpha += alphaOffset + y * alphaStride;
 
-    for (int i = 0; i < size; i++) {
-        const double re = imageOut[i][0] * kernelOut[i][0] - imageOut[i][1] * kernelOut[i][1];
-        const double im = imageOut[i][0] * kernelOut[i][1] + imageOut[i][1] * kernelOut[i][0];
-        imageOut[i][0] = re;
-        imageOut[i][1] = im;
-    }
+        const uchar *left = srcAlpha;
+        const uchar *right = left + alphaStride * radius;
 
-    fftw_execute(imageIFFT);
-
-    // Copy result back. Please note, result is scaled by `width x height` so we need to scale it down.
-    const double invSize = 1.0 / size;
-    data = img.scanLine(0) + alphaOffset;
-    for (int i = 0; i < size; i++) {
-        *data = imageIn[i][0] * invSize;
-        data += alphaStride;
-    }
+        int window = 0;
+        for (int x = 0; x < radius; ++x) {
+            window += *srcAlpha;
+            srcAlpha += alphaStride;
+        }
 
-    fftw_destroy_plan(kernelFFT);
-    fftw_destroy_plan(imageFFT);
-    fftw_destroy_plan(imageIFFT);
+        for (int x = 0; x <= radius; ++x) {
+            window += *right;
+            right += alphaStride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dstStride;
+        }
 
-    fftw_free(kernelIn);
-    fftw_free(kernelOut);
+        for (int x = radius + 1; x < src.width() - radius; ++x) {
+            window += *right - *left;
+            left += alphaStride;
+            right += alphaStride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dstStride;
+        }
 
-    fftw_free(imageIn);
-    fftw_free(imageOut);
+        for (int x = src.width() - radius; x < src.width(); ++x) {
+            window -= *left;
+            left += alphaStride;
+            *dstAlpha = static_cast<uchar>(window * invSize);
+            dstAlpha += dstStride;
+        }
+    }
 }
 
-void boxShadow(QPainter *p, const QRect &box, const QPoint &offset, int radius, const QColor &color)
+void boxBlurAlpha(QImage &image, const BoxBlurProfile &profile)
 {
-    const QSize size = box.size() + 2 * QSize(radius, radius);
+    // Temporary buffer is transposed so we always read memory
+    // in linear order.
+    QImage tmp(image.height(), image.width(), image.format());
+
+    const auto boxSizes = profile.boxSizes();
+    for (const int &boxSize : boxSizes) {
+        boxBlurPass(image, tmp, boxSize); // horizontal pass
+        boxBlurPass(tmp, image, boxSize); // vertical pass
+    }
+}
 
+void boxShadow(QPainter *p, const QRect &box, const QPoint &offset,
+               int radius, const QColor &color)
+{
 #if BREEZE_COMMON_USE_KDE4
     const qreal dpr = 1.0;
 #else
     const qreal dpr = p->device()->devicePixelRatioF();
 #endif
-
-    QPainter painter;
+    const BoxBlurProfile profile(radius * dpr, 3);
+    const QSize size = box.size() + 2 * QSize(profile.padding(), profile.padding());
 
     QImage shadow(size * dpr, QImage::Format_ARGB32_Premultiplied);
 #if !BREEZE_COMMON_USE_KDE4
     shadow.setDevicePixelRatio(dpr);
 #endif
     shadow.fill(Qt::transparent);
 
+    QPainter painter;
     painter.begin(&shadow);
-    painter.fillRect(QRect(QPoint(radius, radius), box.size()), Qt::black);
+    painter.fillRect(QRect(QPoint(profile.padding(), profile.padding()), box.size()), Qt::black);
     painter.end();
 
     // There is no need to blur RGB channels. Blur the alpha
-    // channel and then give the shadow a tint of the desired color.
-    const int radius_ = radius * dpr;
-    if (radius_ < FFT_BLUR_RADIUS_THRESHOLD) {
-        blurAlphaNaive(shadow, radius_);
-    } else {
-        blurAlphaFFT(shadow, radius_);
-    }
+    // channel and do compositing stuff later.
+    boxBlurAlpha(shadow, profile);
 
     painter.begin(&shadow);
     painter.setCompositionMode(QPainter::CompositionMode_SourceIn);