Changeset View
Changeset View
Standalone View
Standalone View
libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
Show All 20 Lines | |||||
21 | #ifndef KOOPTIMIZEDCOMPOSITEOPOVER32_H_ | 21 | #ifndef KOOPTIMIZEDCOMPOSITEOPOVER32_H_ | ||
22 | #define KOOPTIMIZEDCOMPOSITEOPOVER32_H_ | 22 | #define KOOPTIMIZEDCOMPOSITEOPOVER32_H_ | ||
23 | 23 | | |||
24 | #include "KoCompositeOpBase.h" | 24 | #include "KoCompositeOpBase.h" | ||
25 | #include "KoCompositeOpRegistry.h" | 25 | #include "KoCompositeOpRegistry.h" | ||
26 | #include "KoStreamedMath.h" | 26 | #include "KoStreamedMath.h" | ||
27 | 27 | | |||
28 | 28 | | |||
29 | template<Vc::Implementation _impl> | ||||
30 | struct OptiDiv { | ||||
31 | static ALWAYS_INLINE float divScalar(const float& divident, const float& divisor) { | ||||
32 | #ifdef __SSE__ | ||||
33 | float result; | ||||
34 | | ||||
35 | __m128 x = _mm_set_ss(divisor); | ||||
36 | __m128 y = _mm_set_ss(divident); | ||||
37 | x = _mm_rcp_ss(x); | ||||
38 | x = _mm_mul_ss(x, y); | ||||
39 | | ||||
40 | | ||||
41 | _mm_store_ss(&result, x); | ||||
42 | return result; | ||||
43 | #else | ||||
44 | return divident / divisor; | ||||
45 | #endif | ||||
46 | | ||||
47 | } | ||||
48 | | ||||
49 | static ALWAYS_INLINE Vc::float_v divVector(Vc::float_v::AsArg divident, Vc::float_v::AsArg divisor) { | ||||
50 | #ifdef __SSE__ | ||||
51 | return divident * Vc::reciprocal(divisor); | ||||
52 | #else | ||||
53 | return divident / divisor; | ||||
dkazakov: In case SSE is not available, we should use normal division, like `divident / divisor`, not `1.0 / divisor`. It is just faster. | |||||
54 | #endif | ||||
55 | | ||||
56 | } | ||||
57 | | ||||
58 | }; | ||||
59 | | ||||
60 | | ||||
29 | template<typename channels_type, typename pixel_type, bool alphaLocked, bool allChannelsFlag> | 61 | template<typename channels_type, typename pixel_type, bool alphaLocked, bool allChannelsFlag> | ||
30 | struct OverCompositor32 { | 62 | struct OverCompositor32 { | ||
31 | struct OptionalParams { | 63 | struct OptionalParams { | ||
32 | OptionalParams(const KoCompositeOp::ParameterInfo& params) | 64 | OptionalParams(const KoCompositeOp::ParameterInfo& params) | ||
33 | : channelFlags(params.channelFlags) | 65 | : channelFlags(params.channelFlags) | ||
34 | { | 66 | { | ||
35 | } | 67 | } | ||
36 | const QBitArray &channelFlags; | 68 | const QBitArray &channelFlags; | ||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Line(s) | 74 | { | |||
92 | } else { | 124 | } else { | ||
93 | /** | 125 | /** | ||
94 | * The value of new_alpha can have *some* zero values, | 126 | * The value of new_alpha can have *some* zero values, | ||
95 | * which will result in NaN values during division. But | 127 | * which will result in NaN values during division. But | ||
96 | * when converted to integers these NaN values will | 128 | * when converted to integers these NaN values will | ||
97 | * be converted to zeroes, which is exactly what we need | 129 | * be converted to zeroes, which is exactly what we need | ||
98 | */ | 130 | */ | ||
99 | new_alpha = dst_alpha + (uint8Max - dst_alpha) * src_alpha * uint8MaxRec1; | 131 | new_alpha = dst_alpha + (uint8Max - dst_alpha) * src_alpha * uint8MaxRec1; | ||
100 | src_blend = src_alpha / new_alpha; | 132 | | ||
133 | // Optimized version of: | ||||
134 | // src_blend = src_alpha / new_alpha; | ||||
135 | src_blend = OptiDiv<_impl>::divVector(src_alpha, new_alpha); | ||||
136 | | ||||
101 | } | 137 | } | ||
102 | 138 | | |||
103 | if (!(src_blend == oneValue).isFull()) { | 139 | if (!(src_blend == oneValue).isFull()) { | ||
104 | KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3); | 140 | KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3); | ||
105 | 141 | | |||
106 | dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1; | 142 | dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1; | ||
107 | dst_c2 = src_blend * (src_c2 - dst_c2) + dst_c2; | 143 | dst_c2 = src_blend * (src_c2 - dst_c2) + dst_c2; | ||
108 | dst_c3 = src_blend * (src_c3 - dst_c3) + dst_c3; | 144 | dst_c3 = src_blend * (src_c3 - dst_c3) + dst_c3; | ||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Line(s) | 185 | } else if (dstAlpha == 0.0) { | |||
151 | srcBlendNorm = 1.0; | 187 | srcBlendNorm = 1.0; | ||
152 | 188 | | |||
153 | if (!allChannelsFlag) { | 189 | if (!allChannelsFlag) { | ||
154 | pixel_type *d = reinterpret_cast<pixel_type*>(dst); | 190 | pixel_type *d = reinterpret_cast<pixel_type*>(dst); | ||
155 | *d = 0; // dstAlpha is already null | 191 | *d = 0; // dstAlpha is already null | ||
156 | } | 192 | } | ||
157 | } else { | 193 | } else { | ||
158 | dstAlpha += (uint8Max - dstAlpha) * srcAlpha * uint8Rec1; | 194 | dstAlpha += (uint8Max - dstAlpha) * srcAlpha * uint8Rec1; | ||
159 | srcBlendNorm = srcAlpha / dstAlpha; | 195 | // Optimized version of: | ||
196 | // srcBlendNorm = srcAlpha / dstAlpha; | ||||
197 | srcBlendNorm = OptiDiv<_impl>::divScalar(srcAlpha, dstAlpha); | ||||
198 | | ||||
160 | } | 199 | } | ||
161 | 200 | | |||
162 | if(allChannelsFlag) { | 201 | if(allChannelsFlag) { | ||
163 | if (srcBlendNorm == 1.0) { | 202 | if (srcBlendNorm == 1.0) { | ||
164 | if (!alphaLocked) { | 203 | if (!alphaLocked) { | ||
165 | const pixel_type *s = reinterpret_cast<const pixel_type*>(src); | 204 | const pixel_type *s = reinterpret_cast<const pixel_type*>(src); | ||
166 | pixel_type *d = reinterpret_cast<pixel_type*>(dst); | 205 | pixel_type *d = reinterpret_cast<pixel_type*>(dst); | ||
167 | *d = *s; | 206 | *d = *s; | ||
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines |
In case SSE is not available, we should use normal division, like `divident / divisor`, not `1.0 / divisor`. It is just faster.