1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
/* nlmeans_x86.c
Copyright (c) 2013 Dirk Farin
Copyright (c) 2003-2019 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
#include "handbrake.h" // needed for ARCH_X86
#if defined(ARCH_X86)
#include <emmintrin.h>
#include "libavutil/cpu.h"
#include "nlmeans.h"
/*
 * Inclusive running sum over the four 32-bit lanes of v:
 * on return, lane i holds v[0] + v[1] + ... + v[i].
 */
static inline __m128i nlmeans_prefix_sum_epi32(__m128i v)
{
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4)); // lane i += lane i-1
    v = _mm_add_epi32(v, _mm_slli_si128(v, 8)); // lane i += lane i-2
    return v;
}

/*
 * SSE2 implementation of the integral-image builder.
 *
 * For every row y in [0, dst_h) the squared per-pixel difference between
 * src_pre (row y) and compare_pre (row y + dy, shifted by dx pixels) is
 * accumulated left to right, then the already finished row above is added,
 * so each output cell holds the sum of squared differences over the
 * rectangle spanning from the top-left corner to that cell.
 *
 * Both input planes are addressed with a row pitch of (w + 2 * border) bytes.
 * The src and compare arguments are not read by this implementation.
 *
 * Pixels are handled 16 at a time, so each row is read and written up to the
 * next multiple of 16 at or above dst_w. Output rows are accessed with
 * aligned loads/stores, hence integral + y * integral_stride has to sit on a
 * 16-byte boundary for every row.
 */
static void build_integral_sse2(uint32_t *integral,
                                int integral_stride,
                                const uint8_t *src,
                                const uint8_t *src_pre,
                                const uint8_t *compare,
                                const uint8_t *compare_pre,
                                int w,
                                int border,
                                int dst_w,
                                int dst_h,
                                int dx,
                                int dy)
{
    const __m128i zero    = _mm_setzero_si128();
    const int     pitch   = w + 2 * border;

    for (int row = 0; row < dst_h; row++)
    {
        const uint8_t *ref = src_pre + row*pitch;
        const uint8_t *cmp = compare_pre + (row+dy)*pitch + dx;
        uint32_t      *dst = integral + (row*integral_stride);

        // Running total of everything to the left of the current 4-lane group,
        // broadcast to all lanes.
        __m128i carry = _mm_setzero_si128();

        // Pass 1: horizontal prefix sum of squared differences for this row.
        for (int col = 0; col < dst_w; col += 16, ref += 16, cmp += 16, dst += 16)
        {
            const __m128i a = _mm_loadu_si128((const __m128i *)ref);
            const __m128i b = _mm_loadu_si128((const __m128i *)cmp);

            // Zero-extend the bytes to 16 bits and subtract: 8 signed
            // differences for the lower and 8 for the upper pixel half.
            const __m128i d_lo = _mm_sub_epi16(_mm_unpacklo_epi8(a, zero),
                                               _mm_unpacklo_epi8(b, zero));
            const __m128i d_hi = _mm_sub_epi16(_mm_unpackhi_epi8(a, zero),
                                               _mm_unpackhi_epi8(b, zero));

            // 16-bit multiply keeping the low 16 bits of each product. The
            // differences lie in [-255, 255], so each square (<= 65025) fits
            // in 16 bits when read as unsigned.
            const __m128i sq_lo = _mm_mullo_epi16(d_lo, d_lo);
            const __m128i sq_hi = _mm_mullo_epi16(d_hi, d_hi);

            // Zero-extend the squares to 32 bits (pixels 0-3, 4-7, 8-11,
            // 12-15), prefix-sum each group and chain the groups together
            // through the broadcast of the previous group's last lane.
            const __m128i sum0 = _mm_add_epi32(
                nlmeans_prefix_sum_epi32(_mm_unpacklo_epi16(sq_lo, zero)),
                carry);
            const __m128i sum1 = _mm_add_epi32(
                nlmeans_prefix_sum_epi32(_mm_unpackhi_epi16(sq_lo, zero)),
                _mm_shuffle_epi32(sum0, 0xff));
            const __m128i sum2 = _mm_add_epi32(
                nlmeans_prefix_sum_epi32(_mm_unpacklo_epi16(sq_hi, zero)),
                _mm_shuffle_epi32(sum1, 0xff));
            const __m128i sum3 = _mm_add_epi32(
                nlmeans_prefix_sum_epi32(_mm_unpackhi_epi16(sq_hi, zero)),
                _mm_shuffle_epi32(sum2, 0xff));

            // Last lane of the final group seeds the next 16 pixels.
            carry = _mm_shuffle_epi32(sum3, 0xff);

            _mm_store_si128((__m128i *)(dst),      sum0);
            _mm_store_si128((__m128i *)(dst + 4),  sum1);
            _mm_store_si128((__m128i *)(dst + 8),  sum2);
            _mm_store_si128((__m128i *)(dst + 12), sum3);
        }

        // The first row has nothing above it to accumulate.
        if (row == 0)
        {
            continue;
        }

        // Pass 2: add the completed row above to turn the row-wise prefix
        // sums into the 2-D integral.
        uint32_t *cur = integral + row*integral_stride;
        for (int col = 0; col < dst_w; col += 16, cur += 16)
        {
            for (int lane = 0; lane < 16; lane += 4)
            {
                const __m128i above =
                    _mm_load_si128((const __m128i *)(cur + lane - integral_stride));
                const __m128i here  =
                    _mm_load_si128((const __m128i *)(cur + lane));
                _mm_store_si128((__m128i *)(cur + lane),
                                _mm_add_epi32(above, here));
            }
        }
    }
}
/*
 * Installs the x86-specific NLMeans routines into *functions.
 *
 * When the CPU flags reported by libavutil include SSE2, the build_integral
 * entry is pointed at the SSE2 implementation and a log line is emitted.
 * Otherwise *functions is left exactly as the caller passed it in.
 */
void nlmeans_init_x86(NLMeansFunctions *functions)
{
    const int cpu_flags = av_get_cpu_flags();

    // Nothing to install without SSE2 support.
    if (!(cpu_flags & AV_CPU_FLAG_SSE2))
    {
        return;
    }

    functions->build_integral = build_integral_sse2;
    hb_log("NLMeans using SSE2 optimizations");
}
#endif // ARCH_X86
|