contrib/patch-x264-linux.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194

Index: /common/ppc/quant.c
===================================================================
--- /common/ppc/quant.c (revision 601)
+++ /common/ppc/quant.c (revision 621)
@@ -18,8 +18,4 @@
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/
-
-#ifdef HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
 
 #include "common/common.h"
@@ -54,29 +50,29 @@
 temp2v = vec_xor(temp2v, mskB);                                              \
 temp1v = vec_adds(temp1v, vec_and(mskA, one));                                \
-vec_st(temp1v, (dct0), dct);                                                 \
+vec_st(temp1v, (dct0), (int16_t*)dct);                                        \
 temp2v = vec_adds(temp2v, vec_and(mskB, one));                                \
-vec_st(temp2v, (dct1), dct);
+vec_st(temp2v, (dct1), (int16_t*)dct);
                 
 void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
     vector bool short mskA;
-    vec_s32_t i_qbitsv;
+    vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
-    vec_u32_t mfvA;
+    vec_u16_t mfvA;
     vec_s16_t zerov, one;
-    vec_s32_t fV;
+    vec_u32_t fV;
 
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
-    vec_u32_t mfvB;
+    vec_u16_t mfvB;
 
     vec_s16_t temp1v, temp2v;
 
-    vect_sint_u qbits_u;
+    vect_int_u qbits_u;
     qbits_u.s[0]=i_qbits;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_sint_u f_u;
+    vect_int_u f_u;
     f_u.s[0]=f;
 
@@ -114,16 +110,16 @@
 temp2v = vec_xor(temp2v, mskB);                                 \
 temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
-vec_st(temp1v, (dct0), dct);                                    \
+vec_st(temp1v, (dct0), (int16_t*)dct);                          \
 temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
-vec_st(temp2v, (dct1), dct);
+vec_st(temp2v, (dct1), (int16_t*)dct);
 
 
 void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
     vector bool short mskA;
-    vec_s32_t i_qbitsv;
+    vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_s16_t zerov, one;
-    vec_s32_t fV;
+    vec_u32_t fV;
 
     vector bool short mskB;
@@ -133,15 +129,14 @@
     vec_s16_t temp1v, temp2v;
 
-    vec_u32_t mfv;
-    vect_int_u mf_u;
+    vec_u16_t mfv;
+    vect_ushort_u mf_u;
     mf_u.s[0]=i_quant_mf;
     mfv = vec_splat( mf_u.v, 0 );
-    mfv = vec_packs( mfv, mfv);
 
-    vect_sint_u qbits_u;
+    vect_int_u qbits_u;
     qbits_u.s[0]=i_qbits;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_sint_u f_u;
+    vect_int_u f_u;
     f_u.s[0]=f;
     fV = vec_splat(f_u.v, 0);
@@ -156,13 +151,15 @@
 void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
     vector bool short mskA;
-    vec_s32_t i_qbitsv;
+    vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
-    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_u16_t mfvA;
     vec_s16_t zerov, one;
-    vec_s32_t fV;
+    vec_u32_t fV;
     
     vector bool short mskB;
     vec_u16_t coefvB;
-    vec_u32_t multEvenvB, multOddvB, mfvB;
+    vec_u32_t multEvenvB, multOddvB;
+    vec_u16_t mfvB;
     
     vec_s16_t temp1v, temp2v;
@@ -172,5 +169,5 @@
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_sint_u f_u;
+    vect_int_u f_u;
     f_u.s[0]=f;
     fV = vec_splat(f_u.v, 0);
Index: /common/ppc/dct.c
===================================================================
--- /common/ppc/dct.c (revision 604)
+++ /common/ppc/dct.c (revision 621)
@@ -61,6 +61,6 @@
     VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
 
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, (int16_t*)dct);
 }
 
@@ -95,12 +95,12 @@
     VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
 
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32, dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48, dct);
-    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permLowv), 80, dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permLowv), 96, dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permLowv), 112, dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,   (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48,  (int16_t*)dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permLowv),  80,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permLowv),  96,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permLowv),  112, (int16_t*)dct);
 }
 
@@ -312,6 +312,6 @@
 void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[8][8] )
 {
-    vec_s16_t onev = vec_splat_s16(1);
-    vec_s16_t twov = vec_splat_s16(2);
+    vec_u16_t onev = vec_splat_s16(1);
+    vec_u16_t twov = vec_splat_s16(2);
 
     dct[0][0] += 32; // rounding for the >>6 at the end
@@ -342,5 +342,5 @@
     vec_u8_t perm_ldv = vec_lvsl(0, dst);
     vec_u8_t perm_stv = vec_lvsr(8, dst);
-    vec_s16_t sixv = vec_splat_s16(6);
+    vec_u16_t sixv = vec_splat_s16(6);
     const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
     LOAD_ZERO;
Index: /common/ppc/quant.h
===================================================================
--- /common/ppc/quant.h (revision 601)
+++ /common/ppc/quant.h (revision 621)
@@ -19,4 +19,8 @@
 *****************************************************************************/
 
+#ifdef SYS_LINUX
+#include <altivec.h>
+#endif
+
 #ifndef _PPC_QUANT_H
 #define _PPC_QUANT_H 1
@@ -28,8 +32,7 @@
 
 typedef union {
-  signed int s[4];
-  vector signed int v;
-} vect_sint_u;
-
+  unsigned short s[8];
+  vector unsigned short v;
+} vect_ushort_u;
 
 void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );