MRPT  1.9.9
CImage_SSE3.cpp
Go to the documentation of this file.
1 /* +------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | https://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2019, Individual contributors, see AUTHORS file |
6  | See: https://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See: https://www.mrpt.org/License |
8  +------------------------------------------------------------------------+ */
9 
10 #include "img-precomp.h" // Precompiled headers
11 
12 #include <mrpt/config.h>
13 
14 // ---------------------------------------------------------------------------
15 // This file contains the SSE3/SSSE3 optimized functions for
16 // mrpt::img::CImage
17 // See the sources and the doxygen documentation page "sse_optimizations" for
18 // more details.
19 // ---------------------------------------------------------------------------
20 #if MRPT_HAS_SSE3
21 
22 #include <mrpt/core/SSE_macros.h>
23 #include <mrpt/core/SSE_types.h>
24 #include <mrpt/img/CImage.h>
25 #include <mrpt/system/memory.h>
26 #include "CImage_SSEx.h"
27 
28 /** \addtogroup sse_optimizations
29  * SSE optimized functions
30  * @{
31  */
32 
33 // This is the actual function behind image_SSSE3_scale_half_3c8u():
34 template <bool MemIsAligned>
36  const uint8_t* in, uint8_t* out, int w, int h, size_t step_in,
37  size_t step_out)
38 {
39  SSE_DISABLE_WARNINGS
40  // clang-format off
41 
42  const __m128i m0 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x0D, 0x0C, 0x08, 0x07, 0x06, 0x02, 0x01, 0x00);
43  const __m128i m1 = _mm_set_epi8(0x0E, 0x0A, 0x09, 0x08, 0x04, 0x03, 0x02, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
44  const __m128i m2 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0B, 0x0A, 0x06, 0x05, 0x04, 0x00, 0x80);
45  const __m128i m3 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F);
46 
47  // clang-format on
48  SSE_RESTORE_SIGN_WARNINGS
49 
50  const int sw = w / 16; // This are the number of 3*16 blocks in each row
51  const int sh = h / 2;
52  const int rest_w = w - (16 * w);
53 
54  for (int i = 0; i < sh; i++)
55  {
56  const __m128i* inp = reinterpret_cast<const __m128i*>(in);
57  uint8_t* outp = out;
58 
59  for (int j = 0; j < sw; j++)
60  {
61  // 16-byte blocks #0,#1,#2:
62  __m128i d0 = mm_load_si128<MemIsAligned>(inp++);
63  __m128i d1 = mm_load_si128<MemIsAligned>(inp++);
64 
65  // First 16 bytes:
66  __m128i shuf0 = _mm_shuffle_epi8(d0, m0);
67  __m128i shuf1 = _mm_shuffle_epi8(d1, m1);
68 
69  __m128i res0 = _mm_or_si128(shuf0, shuf1);
70 
71  _mm_storeu_si128(
72  reinterpret_cast<__m128i*>(outp), res0); // aligned output
73  outp += 16;
74 
75  // Last 8 bytes:
76  __m128i d2 = mm_load_si128<MemIsAligned>(inp++);
77 
78  // Write lower 8 bytes only
79  _mm_storel_epi64(
80  reinterpret_cast<__m128i*>(outp),
81  _mm_or_si128(
82  _mm_shuffle_epi8(d2, m2), _mm_shuffle_epi8(d1, m3)));
83  outp += 8;
84  }
85 
86  // Extra pixels? (w mod 16 != 0)
87  if (rest_w != 0)
88  {
89  const uint8_t* in_rest = in + 3 * 16 * sw;
90  for (int p = 0; p < rest_w / 2; p++)
91  {
92  outp[0] = in_rest[0];
93  outp[1] = in_rest[1];
94  outp[2] = in_rest[2];
95  in_rest += 6;
96  outp += 3;
97  }
98  }
99 
100  in += 2 * step_in; // Skip one row
101  out += step_out;
102  }
103 }
104 
105 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel &
106  * ignoring the other 3
107  * - <b>Input format:</b> uint8_t, 3 channels (RGB or BGR)
108  * - <b>Output format:</b> uint8_t, 3 channels (RGB or BGR)
109  * - <b>Preconditions:</b> in & out may be aligned to 16bytes (faster) or not,
110  * step may be k*16 (faster) or not.
111  * - <b>Notes:</b>
112  * - <b>Requires:</b> SSSE3
113  * - <b>Invoked from:</b> mrpt::img::CImage::scaleHalf()
114  */
116  const uint8_t* in, uint8_t* out, int w, int h, size_t step_in,
117  size_t step_out)
118 {
119  if (mrpt::system::is_aligned<16>(in) && mrpt::system::is_aligned<16>(out) &&
120  is_multiple<16>(step_in) && is_multiple<16>(step_out))
121  {
122  impl_image_SSSE3_scale_half_3c8u<true>(
123  in, out, w, h, step_in, step_out);
124  }
125  else
126  {
127  impl_image_SSSE3_scale_half_3c8u<false>(
128  in, out, w, h, step_in, step_out);
129  }
130 }
131 
132 // This is the actual function behind both: image_SSSE3_rgb_to_gray_8u() and
133 // image_SSSE3_bgr_to_gray_8u():
134 template <bool IS_RGB, bool MemIsAligned>
136  const uint8_t* in, uint8_t* out, int w, int h, size_t step_in,
137  size_t step_out)
138 {
139  SSE_DISABLE_WARNINGS
140  // clang-format off
141 
142  // Masks: 0 1 2 3 4 5 6 7 8 9 A B C D E F
143  // reds[0-7] from D0
144  const __m128i mask0 = _mm_setr_epi8(0x80, 0x00, 0x80, 0x03, 0x80, 0x06, 0x80, 0x09, 0x80, 0x0C, 0x80, 0x0F, 0x80, 0x80, 0x80, 0x80);
145  // reds[0-7] from D1
146  const __m128i mask1 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02, 0x80, 0x05);
147  // greens[0-7] from D0
148  const __m128i mask2 = _mm_setr_epi8(0x80, 0x01, 0x80, 0x04, 0x80, 0x07, 0x80, 0x0A, 0x80, 0x0D, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
149  // greens[0-7] from D1
150  const __m128i mask3 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x03, 0x80, 0x06);
151  // blues[0-7] from D0
152  const __m128i mask4 = _mm_setr_epi8(0x80, 0x02, 0x80, 0x05, 0x80, 0x08, 0x80, 0x0B, 0x80, 0x0E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
153  // blues[0-7] from D1
154  const __m128i mask5 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x80, 0x04, 0x80, 0x07);
155  // reds[8-15] from D1
156  const __m128i mask6 = _mm_setr_epi8(0x80, 0x08, 0x80, 0x0B, 0x80, 0x0E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
157  // reds[8-15] from D2
158  const __m128i mask7 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x80, 0x04, 0x80, 0x07, 0x80, 0x0A, 0x80, 0x0D);
159  // greens[8-15] from D1
160  const __m128i mask8 = _mm_setr_epi8(0x80, 0x09, 0x80, 0x0C, 0x80, 0x0F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
161  // greens[8-15] from D2
162  const __m128i mask9 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02, 0x80, 0x05, 0x80, 0x08, 0x80, 0x0B, 0x80, 0x0E);
163  // blues[8-15] from D1
164  const __m128i mask10 = _mm_setr_epi8(0x80, 0x0A, 0x80, 0x0D, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
165  // blues[8-15] from D2
166  const __m128i mask11 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x03, 0x80, 0x06, 0x80, 0x09, 0x80, 0x0C, 0x80, 0x0F);
167  // Conversion factors for RGB->Y
168  const __m128i VAL_R = _mm_setr_epi8(0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D, 0x00, 0x1D);
169  const __m128i VAL_G = _mm_setr_epi8(0x00, 0x96, 0x00, 0x96, 0x00, 0x96, 0x00, 0x96, 0x00, 0x96, 0x00, 0x96, 0x00, 0x96, 0x00, 0x96);
170  const __m128i VAL_B = _mm_setr_epi8(0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D, 0x00, 0x4D);
171  // mask:
172  const __m128i mask_low = _mm_setr_epi8(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
173 
174  // clang-format on
175  SSE_RESTORE_SIGN_WARNINGS
176 
177  const __m128i m0 = IS_RGB ? mask4 : mask0;
178  const __m128i m1 = IS_RGB ? mask5 : mask1;
179  const __m128i m2 = mask2;
180  const __m128i m3 = mask3;
181  const __m128i m4 = IS_RGB ? mask0 : mask4;
182  const __m128i m5 = IS_RGB ? mask1 : mask5;
183  const __m128i m6 = IS_RGB ? mask10 : mask6;
184  const __m128i m7 = IS_RGB ? mask11 : mask7;
185  const __m128i m8 = mask8;
186  const __m128i m9 = mask9;
187  const __m128i m10 = IS_RGB ? mask6 : mask10;
188  const __m128i m11 = IS_RGB ? mask7 : mask11;
189 
190  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
191  const int sh = h;
192 
193  for (int i = 0; i < sh; i++)
194  {
195  const __m128i* inp = reinterpret_cast<const __m128i*>(in);
196  uint8_t* outp = out;
197 
198  for (int j = 0; j < sw; j++)
199  {
200  // We process RGB data in blocks of 3 x 16byte blocks:
201  const __m128i d0 = mm_load_si128<MemIsAligned>(inp++);
202  const __m128i d1 = mm_load_si128<MemIsAligned>(inp++);
203  const __m128i d2 = mm_load_si128<MemIsAligned>(inp++);
204 
205  // First 8 bytes of gray levels:
206  {
207  const __m128i BLUES_0_7 = _mm_or_si128(
208  _mm_shuffle_epi8(d0, m0), _mm_shuffle_epi8(d1, m1));
209  const __m128i GREENS_0_7 = _mm_or_si128(
210  _mm_shuffle_epi8(d0, m2), _mm_shuffle_epi8(d1, m3));
211  const __m128i REDS_0_7 = _mm_or_si128(
212  _mm_shuffle_epi8(d0, m4), _mm_shuffle_epi8(d1, m5));
213 
214  // _mm_mulhi_epu16(): Multiplies the 8 unsigned 16-bit integers
215  // from a by the 8 unsigned 16-bit integers from b.
216  // r0 := (a0 * b0)[31:16]
217  // r1 := (a1 * b1)[31:16]
218  //...
219  // r7 := (a7 * b7)[31:16]
220  //
221  const __m128i GRAYS_0_7 = _mm_adds_epu16(
222  _mm_mulhi_epu16(REDS_0_7, VAL_R),
223  _mm_adds_epu16(
224  _mm_mulhi_epu16(GREENS_0_7, VAL_G),
225  _mm_mulhi_epu16(BLUES_0_7, VAL_B)));
226 
227  _mm_storel_epi64(
228  reinterpret_cast<__m128i*>(outp),
229  _mm_shuffle_epi8(GRAYS_0_7, mask_low));
230  outp += 8;
231  }
232 
233  // Second 8 bytes of gray levels:
234  {
235  const __m128i BLUES_8_15 = _mm_or_si128(
236  _mm_shuffle_epi8(d1, m6), _mm_shuffle_epi8(d2, m7));
237  const __m128i GREENS_8_15 = _mm_or_si128(
238  _mm_shuffle_epi8(d1, m8), _mm_shuffle_epi8(d2, m9));
239  const __m128i REDS_8_15 = _mm_or_si128(
240  _mm_shuffle_epi8(d1, m10), _mm_shuffle_epi8(d2, m11));
241 
242  const __m128i GRAYS_8_15 = _mm_adds_epu16(
243  _mm_mulhi_epu16(REDS_8_15, VAL_R),
244  _mm_adds_epu16(
245  _mm_mulhi_epu16(GREENS_8_15, VAL_G),
246  _mm_mulhi_epu16(BLUES_8_15, VAL_B)));
247 
248  _mm_storel_epi64(
249  reinterpret_cast<__m128i*>(outp),
250  _mm_shuffle_epi8(GRAYS_8_15, mask_low));
251  outp += 8;
252  }
253  }
254  in += step_in;
255  out += step_out;
256  }
257 
258 } // end impl_image_SSSE3_rgb_or_bgr_to_gray_8u()
259 
260 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using
261  * Y=77*R+150*G+29*B
262  * - <b>Input format:</b> uint8_t, 3 channels (BGR order)
263  * - <b>Output format:</b> uint8_t, 1 channel
264  * - <b>Preconditions:</b> in & out aligned to 16bytes (faster) or not, step =
265  * k*16
266  * - <b>Notes:</b>
267  * - <b>Requires:</b> SSSE3
268  * - <b>Invoked from:</b> mrpt::img::CImage::grayscale(),
269  * mrpt::img::CImage::grayscaleInPlace()
270  */
272  const uint8_t* in, uint8_t* out, int w, int h, size_t step_in,
273  size_t step_out)
274 {
275  ASSERTMSG_((step_in & 0x0f) == 0, "step of input image must be 16*k");
276  ASSERTMSG_((step_out & 0x0f) == 0, "step of output image must be 16*k");
277 
278  if (mrpt::system::is_aligned<16>(in) && mrpt::system::is_aligned<16>(out))
279  {
280  impl_image_SSSE3_rgb_or_bgr_to_gray_8u<false, true>(
281  in, out, w, h, step_in, step_out);
282  }
283  else
284  {
285  impl_image_SSSE3_rgb_or_bgr_to_gray_8u<false, false>(
286  in, out, w, h, step_in, step_out);
287  }
288 }
289 
290 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using
291  * Y=77*R+150*G+29*B
292  * - <b>Input format:</b> uint8_t, 3 channels (RGB order)
293  * - <b>Output format:</b> uint8_t, 1 channel
294  * - <b>Preconditions:</b> in & out aligned to 16bytes (faster) or not, step =
295  * k*16
296  * - <b>Notes:</b>
297  * - <b>Requires:</b> SSSE3
298  * - <b>Invoked from:</b> mrpt::img::CImage::grayscale(),
299  * mrpt::img::CImage::grayscaleInPlace()
300  */
302  const uint8_t* in, uint8_t* out, int w, int h, size_t step_in,
303  size_t step_out)
304 {
305  ASSERTMSG_((step_in & 0x0f) == 0, "step of input image must be 16*k");
306  ASSERTMSG_((step_out & 0x0f) == 0, "step of output image must be 16*k");
307 
308  if (mrpt::system::is_aligned<16>(in) && mrpt::system::is_aligned<16>(out))
309  {
310  impl_image_SSSE3_rgb_or_bgr_to_gray_8u<true, true>(
311  in, out, w, h, step_in, step_out);
312  }
313  else
314  {
315  impl_image_SSSE3_rgb_or_bgr_to_gray_8u<true, false>(
316  in, out, w, h, step_in, step_out);
317  }
318 }
319 
320 /** @} */
321 
322 #endif // end of MRPT_HAS_SSE3
void impl_image_SSSE3_rgb_or_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h, size_t step_in, size_t step_out)
void image_SSSE3_scale_half_3c8u(const uint8_t *in, uint8_t *out, int w, int h, size_t step_in, size_t step_out)
Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3...
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:4199
unsigned char uint8_t
Definition: rptypes.h:44
void impl_image_SSSE3_scale_half_3c8u(const uint8_t *in, uint8_t *out, int w, int h, size_t step_in, size_t step_out)
Definition: CImage_SSE3.cpp:35
#define ASSERTMSG_(f, __ERROR_MSG)
Defines an assertion mechanism.
Definition: exceptions.h:108
GLuint in
Definition: glext.h:7391
void image_SSSE3_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h, size_t step_in, size_t step_out)
Convert a BGR image (3c8u) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.
GLfloat GLfloat p
Definition: glext.h:6398
void image_SSSE3_rgb_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h, size_t step_in, size_t step_out)
Convert an RGB image (3c8u) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.



Page generated by Doxygen 1.8.14 for MRPT 1.9.9 Git: 8fe78517f Sun Jul 14 19:43:28 2019 +0200 at lun oct 28 02:10:00 CET 2019