diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index da4bcbfbdd..b2741a49f2 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,90 +1,92 @@
 set( EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR} )

 include_directories(
     ${CMAKE_SOURCE_DIR}/sdk/tests
     ${CMAKE_SOURCE_DIR}/libs/pigment
     ${CMAKE_SOURCE_DIR}/libs/pigment/compositeops
 )

 include_directories(SYSTEM
     ${EIGEN3_INCLUDE_DIR}
     ${Boost_INCLUDE_DIRS}
 )

 set(LINK_VC_LIB)
 if(HAVE_VC)
     include_directories(${Vc_INCLUDE_DIR})
-#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
+    # Vc_DEFINITIONS may be a ;-separated list; flatten it before appending to the flag string.
+    string(REPLACE ";" " " vc_definitions_flags "${Vc_DEFINITIONS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${vc_definitions_flags}")
     set(LINK_VC_LIB ${Vc_LIBRARIES})
 endif()

 macro_add_unittest_definitions()

 ########### next target ###############

 set(kis_datamanager_benchmark_SRCS kis_datamanager_benchmark.cpp)
 set(kis_hiterator_benchmark_SRCS kis_hline_iterator_benchmark.cpp)
 set(kis_viterator_benchmark_SRCS kis_vline_iterator_benchmark.cpp)
 set(kis_random_iterator_benchmark_SRCS kis_random_iterator_benchmark.cpp)
 set(kis_projection_benchmark_SRCS kis_projection_benchmark.cpp)
 set(kis_bcontrast_benchmark_SRCS kis_bcontrast_benchmark.cpp)
 set(kis_blur_benchmark_SRCS kis_blur_benchmark.cpp)
 set(kis_level_filter_benchmark_SRCS kis_level_filter_benchmark.cpp)
 set(kis_painter_benchmark_SRCS kis_painter_benchmark.cpp)
 set(kis_stroke_benchmark_SRCS kis_stroke_benchmark.cpp)
 set(kis_fast_math_benchmark_SRCS kis_fast_math_benchmark.cpp)
 set(kis_floodfill_benchmark_SRCS kis_floodfill_benchmark.cpp)
 set(kis_gradient_benchmark_SRCS kis_gradient_benchmark.cpp)
 set(kis_mask_generator_benchmark_SRCS kis_mask_generator_benchmark.cpp)
 set(kis_low_memory_benchmark_SRCS kis_low_memory_benchmark.cpp)
 set(kis_filter_selections_benchmark_SRCS kis_filter_selections_benchmark.cpp)
 if (UNIX)
-    #set(kis_composition_benchmark_SRCS kis_composition_benchmark.cpp)
+    set(kis_composition_benchmark_SRCS kis_composition_benchmark.cpp)
 endif()
 set(kis_thumbnail_benchmark_SRCS kis_thumbnail_benchmark.cpp)

 krita_add_benchmark(KisDatamanagerBenchmark TESTNAME krita-benchmarks-KisDataManager ${kis_datamanager_benchmark_SRCS})
 krita_add_benchmark(KisHLineIteratorBenchmark TESTNAME krita-benchmarks-KisHLineIterator ${kis_hiterator_benchmark_SRCS})
 krita_add_benchmark(KisVLineIteratorBenchmark TESTNAME krita-benchmarks-KisVLineIterator ${kis_viterator_benchmark_SRCS})
 krita_add_benchmark(KisRandomIteratorBenchmark TESTNAME krita-benchmarks-KisRandomIterator ${kis_random_iterator_benchmark_SRCS})
 krita_add_benchmark(KisProjectionBenchmark TESTNAME krita-benchmarks-KisProjectionBenchmark ${kis_projection_benchmark_SRCS})
 krita_add_benchmark(KisBContrastBenchmark TESTNAME krita-benchmarks-KisBContrastBenchmark ${kis_bcontrast_benchmark_SRCS})
 krita_add_benchmark(KisBlurBenchmark TESTNAME krita-benchmarks-KisBlurBenchmark ${kis_blur_benchmark_SRCS})
 krita_add_benchmark(KisLevelFilterBenchmark TESTNAME krita-benchmarks-KisLevelFilterBenchmark ${kis_level_filter_benchmark_SRCS})
 krita_add_benchmark(KisPainterBenchmark TESTNAME krita-benchmarks-KisPainterBenchmark ${kis_painter_benchmark_SRCS})
 krita_add_benchmark(KisStrokeBenchmark TESTNAME krita-benchmarks-KisStrokeBenchmark ${kis_stroke_benchmark_SRCS})
 krita_add_benchmark(KisFastMathBenchmark TESTNAME krita-benchmarks-KisFastMath ${kis_fast_math_benchmark_SRCS})
 krita_add_benchmark(KisFloodfillBenchmark TESTNAME krita-benchmarks-KisFloodFill ${kis_floodfill_benchmark_SRCS})
 krita_add_benchmark(KisGradientBenchmark TESTNAME krita-benchmarks-KisGradientFill ${kis_gradient_benchmark_SRCS})
 krita_add_benchmark(KisMaskGeneratorBenchmark TESTNAME krita-benchmarks-KisMaskGenerator ${kis_mask_generator_benchmark_SRCS})
 krita_add_benchmark(KisLowMemoryBenchmark TESTNAME krita-benchmarks-KisLowMemory ${kis_low_memory_benchmark_SRCS})
 krita_add_benchmark(KisFilterSelectionsBenchmark TESTNAME krita-image-KisFilterSelectionsBenchmark ${kis_filter_selections_benchmark_SRCS})
 if(UNIX)
-    #krita_add_benchmark(KisCompositionBenchmark TESTNAME krita-benchmarks-KisComposition ${kis_composition_benchmark_SRCS})
+    krita_add_benchmark(KisCompositionBenchmark TESTNAME krita-benchmarks-KisComposition ${kis_composition_benchmark_SRCS})
 endif()
 krita_add_benchmark(KisThumbnailBenchmark TESTNAME krita-benchmarks-KisThumbnail ${kis_thumbnail_benchmark_SRCS})

 target_link_libraries(KisDatamanagerBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisHLineIteratorBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisVLineIteratorBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisRandomIteratorBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisProjectionBenchmark kritaimage kritaui Qt5::Test)
 target_link_libraries(KisBContrastBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisBlurBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisLevelFilterBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisPainterBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisStrokeBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisFastMathBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisFloodfillBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisGradientBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisLowMemoryBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisFilterSelectionsBenchmark kritaimage Qt5::Test)
 if(UNIX)
-    #target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test ${LINK_VC_LIB})
-    #if(HAVE_VC)
-    #    set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${Vc_ARCHITECTURE_FLAGS}")
-    #endif()
+    target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test ${LINK_VC_LIB})
+    if(HAVE_VC)
+        set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${Vc_ARCHITECTURE_FLAGS}")
+    endif()
 endif()

 target_link_libraries(KisMaskGeneratorBenchmark kritaimage Qt5::Test)
 target_link_libraries(KisThumbnailBenchmark kritaimage Qt5::Test)
diff --git
a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h index e6f958a2cc..a1b7861968 100644 --- a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h +++ b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h @@ -1,248 +1,287 @@ /* * Copyright (c) 2006 Cyrille Berger * Copyright (c) 2011 Silvio Heinrich * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. 
*/ #ifndef KOOPTIMIZEDCOMPOSITEOPOVER32_H_ #define KOOPTIMIZEDCOMPOSITEOPOVER32_H_ #include "KoCompositeOpBase.h" #include "KoCompositeOpRegistry.h" #include "KoStreamedMath.h" +template +struct OptiDiv { + static ALWAYS_INLINE float divScalar(const float& divident, const float& divisor) { +#ifdef __SSE__ + float result; + + __m128 x = _mm_set_ss(divisor); + __m128 y = _mm_set_ss(divident); + x = _mm_rcp_ss(x); + x = _mm_mul_ss(x, y); + + + _mm_store_ss(&result, x); + return result; +#else + return divident / divisor; +#endif + + } + + static ALWAYS_INLINE Vc::float_v divVector(Vc::float_v::AsArg divident, Vc::float_v::AsArg divisor) { +#ifdef __SSE__ + return divident * Vc::reciprocal(divisor); +#else + return divident / divisor; +#endif + + } + +}; + + template struct OverCompositor32 { struct OptionalParams { OptionalParams(const KoCompositeOp::ParameterInfo& params) : channelFlags(params.channelFlags) { } const QBitArray &channelFlags; }; // \see docs in AlphaDarkenCompositor32 template static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const OptionalParams &oparams) { Q_UNUSED(oparams); Vc::float_v src_alpha; Vc::float_v dst_alpha; src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32(src); bool haveOpacity = opacity != 1.0; Vc::float_v opacity_norm_vec(opacity); Vc::float_v uint8Max((float)255.0); Vc::float_v uint8MaxRec1((float)1.0 / 255.0); Vc::float_v zeroValue(Vc::Zero); Vc::float_v oneValue(Vc::One); src_alpha *= opacity_norm_vec; if (haveMask) { Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask); src_alpha *= mask_vec * uint8MaxRec1; } // The source cannot change the colors in the destination, // since its fully transparent if ((src_alpha == zeroValue).isFull()) { return; } dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32(dst); Vc::float_v src_c1; Vc::float_v src_c2; Vc::float_v src_c3; Vc::float_v dst_c1; Vc::float_v dst_c2; Vc::float_v dst_c3; 
KoStreamedMath<_impl>::template fetch_colors_32(src, src_c1, src_c2, src_c3); Vc::float_v src_blend; Vc::float_v new_alpha; if ((dst_alpha == uint8Max).isFull()) { new_alpha = dst_alpha; src_blend = src_alpha * uint8MaxRec1; } else if ((dst_alpha == zeroValue).isFull()) { new_alpha = src_alpha; src_blend = oneValue; } else { /** * The value of new_alpha can have *some* zero values, * which will result in NaN values while division. But * when converted to integers these NaN values will * be converted to zeroes, which is exactly what we need */ new_alpha = dst_alpha + (uint8Max - dst_alpha) * src_alpha * uint8MaxRec1; - src_blend = src_alpha / new_alpha; + + // Optimized version of: + // src_blend = src_alpha / new_alpha; + src_blend = OptiDiv<_impl>::divVector(src_alpha, new_alpha); + } if (!(src_blend == oneValue).isFull()) { KoStreamedMath<_impl>::template fetch_colors_32(dst, dst_c1, dst_c2, dst_c3); dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1; dst_c2 = src_blend * (src_c2 - dst_c2) + dst_c2; dst_c3 = src_blend * (src_c3 - dst_c3) + dst_c3; } else { if (!haveMask && !haveOpacity) { memcpy(dst, src, 4 * Vc::float_v::size()); return; } else { // opacity has changed the alpha of the source, // so we can't just memcpy the bytes dst_c1 = src_c1; dst_c2 = src_c2; dst_c3 = src_c3; } } KoStreamedMath<_impl>::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3); } template static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, const OptionalParams &oparams) { using namespace Arithmetic; const qint32 alpha_pos = 3; const float uint8Rec1 = 1.0 / 255.0; const float uint8Max = 255.0; float srcAlpha = src[alpha_pos]; srcAlpha *= opacity; if (haveMask) { srcAlpha *= float(*mask) * uint8Rec1; } if (srcAlpha != 0.0) { float dstAlpha = dst[alpha_pos]; float srcBlendNorm; if (dstAlpha == uint8Max) { srcBlendNorm = srcAlpha * uint8Rec1; } else if (dstAlpha == 0.0) { dstAlpha = srcAlpha; srcBlendNorm 
= 1.0; if (!allChannelsFlag) { pixel_type *d = reinterpret_cast(dst); *d = 0; // dstAlpha is already null } } else { dstAlpha += (uint8Max - dstAlpha) * srcAlpha * uint8Rec1; - srcBlendNorm = srcAlpha / dstAlpha; + // Optimized version of: + // srcBlendNorm = srcAlpha / dstAlpha); + srcBlendNorm = OptiDiv<_impl>::divScalar(srcAlpha, dstAlpha); + } if(allChannelsFlag) { if (srcBlendNorm == 1.0) { if (!alphaLocked) { const pixel_type *s = reinterpret_cast(src); pixel_type *d = reinterpret_cast(dst); *d = *s; } else { dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; } } else if (srcBlendNorm != 0.0){ dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm); dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm); dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm); } } else { const QBitArray &channelFlags = oparams.channelFlags; if (srcBlendNorm == 1.0) { if(channelFlags.at(0)) dst[0] = src[0]; if(channelFlags.at(1)) dst[1] = src[1]; if(channelFlags.at(2)) dst[2] = src[2]; } else if (srcBlendNorm != 0.0) { if(channelFlags.at(0)) dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm); if(channelFlags.at(1)) dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm); if(channelFlags.at(2)) dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm); } } if (!alphaLocked) { dst[alpha_pos] = KoStreamedMath<_impl>::round_float_to_uint(dstAlpha); } } } }; /** * An optimized version of a composite op for the use in 4 byte * colorspaces with alpha channel placed at the last byte of * the pixel: C1_C2_C3_A. 
*/ template class KoOptimizedCompositeOpOver32 : public KoCompositeOp { public: KoOptimizedCompositeOpOver32(const KoColorSpace* cs) : KoCompositeOp(cs, COMPOSITE_OVER, i18n("Normal"), KoCompositeOp::categoryMix()) {} using KoCompositeOp::composite; virtual void composite(const KoCompositeOp::ParameterInfo& params) const { if(params.maskRowStart) { composite(params); } else { composite(params); } } template inline void composite(const KoCompositeOp::ParameterInfo& params) const { if (params.channelFlags.isEmpty() || params.channelFlags == QBitArray(4, true)) { KoStreamedMath<_impl>::template genericComposite32 >(params); } else { const bool allChannelsFlag = params.channelFlags.at(0) && params.channelFlags.at(1) && params.channelFlags.at(2); const bool alphaLocked = !params.channelFlags.at(3); if (allChannelsFlag && alphaLocked) { KoStreamedMath<_impl>::template genericComposite32_novector >(params); } else if (!allChannelsFlag && !alphaLocked) { KoStreamedMath<_impl>::template genericComposite32_novector >(params); } else /*if (!allChannelsFlag && alphaLocked) */{ KoStreamedMath<_impl>::template genericComposite32_novector >(params); } } } }; #endif // KOOPTIMIZEDCOMPOSITEOPOVER32_H_