author    Carsten Haitzler (Rasterman) <[email protected]>  2019-01-08 16:28:30 +0000
committer Eric Anholt <[email protected]>                    2019-01-28 16:45:45 -0800
commit    300d3ae8b1445b5060f92c77c0f577f4b7b2c7d6 (patch)
tree      9f0a9536356509d419c393adc4712d3413d1ea63 /src/broadcom/common
parent    522f68847152e9111def094f7fb35b44f3d0fc80 (diff)
vc4: Declare the cpu pointers as being modified in NEON asm.
Otherwise, the compiler is free to reuse the register containing the input
for another call and assume that the value hasn't been modified. Fixes
crashes on texture upload/download with current gcc.

We now have to have a temporary for the cpu2 value, since outputs must be
lvalues.

(commit message by anholt)

Fixes: 4d30024238ef ("vc4: Use NEON to speed up utile loads on Pi2.")
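For reference, here is a minimal sketch of the constraint pattern the patch
switches to, using a hypothetical copy_two_rows() helper rather than the
actual v3d_load_utile()/v3d_store_utile() code. The post-increment addressing
writes back to the registers holding cpu and cpu2, so those operands are
declared as read-write ("+r") outputs instead of plain "r" inputs, and since
asm outputs must be lvalues, cpu + 8 is hoisted into a cpu2 temporary first.

#include <stdint.h>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/* Hypothetical helper, not part of the patch: copies two 16-byte lines
 * from a tiled GPU buffer to a strided CPU buffer. */
static void
copy_two_rows(void *cpu, uint32_t cpu_stride, void *gpu)
{
        /* Outputs must be lvalues, so "cpu + 8" gets a temporary instead
         * of appearing directly in the operand list. */
        void *cpu2 = cpu + 8;

        __asm__ volatile (
                /* Load two 16-byte GPU lines into d0-d3. */
                "vldm %[gpu], {q0, q1}\n"
                /* Store each line in two 8-byte halves; the post-increment
                 * writeback modifies the cpu/cpu2 registers. */
                "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                "vst1.8 d2, [%[cpu]]\n"
                "vst1.8 d3, [%[cpu2]]\n"
                /* Read-write outputs: the compiler now knows these
                 * registers no longer hold the original pointers. */
                : [cpu] "+r"(cpu),
                  [cpu2] "+r"(cpu2)
                : [gpu] "r"(gpu),
                  [cpu_stride] "r"(cpu_stride)
                : "q0", "q1", "memory");
}
#endif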
Diffstat (limited to 'src/broadcom/common')
-rw-r--r--  src/broadcom/common/v3d_cpu_tiling.h | 33
1 file changed, 15 insertions(+), 18 deletions(-)
diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h
index 246d3d6d3ab..e10b4586609 100644
--- a/src/broadcom/common/v3d_cpu_tiling.h
+++ b/src/broadcom/common/v3d_cpu_tiling.h
@@ -49,13 +49,13 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d7, [%[cpu]]\n"
- :
+ : [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
@@ -73,10 +73,9 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
"vst1.8 d6, [%[cpu]]\n"
"vst1.8 d7, [%[cpu2]]\n"
- :
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
- [cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
@@ -99,13 +98,13 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[1], [%[cpu]]\n"
- :
+ : [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
@@ -123,10 +122,9 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]]\n"
"st1 {v3.D}[1], [%[cpu2]]\n"
- :
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
- [cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
@@ -168,6 +166,7 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* destination. (vld1 can only store one d-register
@@ -183,10 +182,9 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"vld1.8 d7, [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"vstm %[gpu], {q0, q1, q2, q3}\n"
- :
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
- [cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
@@ -207,13 +205,13 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"ld1 {v3.D}[1], [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
- :
+ : [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* destination. (vld1 can only store one d-register
@@ -229,10 +227,9 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"ld1 {v3.D}[1], [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
- :
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
- [cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
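
The AArch64 hunks apply the same rule; a corresponding sketch (again a
hypothetical cut-down helper, not the code in the tree) might look like:

#include <stdint.h>

#ifdef __aarch64__
/* Post-indexed st1 updates the address register, so cpu/cpu2 are
 * read-write outputs here as well. */
static void
copy_two_rows(void *cpu, uint32_t cpu_stride, void *gpu)
{
        void *cpu2 = cpu + 8;

        __asm__ volatile (
                /* Load two 16-byte GPU lines into v0-v1. */
                "ld1 {v0.2d, v1.2d}, [%[gpu]]\n"
                "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                "st1 {v1.D}[0], [%[cpu]]\n"
                "st1 {v1.D}[1], [%[cpu2]]\n"
                : [cpu] "+r"(cpu),
                  [cpu2] "+r"(cpu2)
                : [gpu] "r"(gpu),
                  [cpu_stride] "r"(cpu_stride)
                : "v0", "v1", "memory");
}
#endif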