Main MRPT website > C++ reference for MRPT 1.5.7
CImage_SSE2.cpp
Go to the documentation of this file.
1 /* +---------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | http://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2017, Individual contributors, see AUTHORS file |
6  | See: http://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See details in http://www.mrpt.org/License |
8  +---------------------------------------------------------------------------+ */
9 
10 #include "base-precomp.h" // Precompiled headers
11 
12 #if MRPT_HAS_SSE2
13 // ---------------------------------------------------------------------------
14 // This file contains the SSE2 optimized functions for mrpt::utils::CImage
15 // See the sources and the doxygen documentation page "sse_optimizations" for more details.
16 //
17 // Some functions here are derived from sources in libcvd, released
18 // under LGPL. See http://mi.eng.cam.ac.uk/~er258/cvd/
19 //
20 // ---------------------------------------------------------------------------
21 
22 #include <mrpt/utils/CImage.h>
23 #include <mrpt/utils/SSE_types.h>
24 #include <mrpt/utils/SSE_macros.h>
25 #include "CImage_SSEx.h"
26 
27 /** \addtogroup sse_optimizations
28  * SSE optimized functions
29  * @{
30  */
31 
/** Subsample each 2x2 pixel block into 1x1 pixel, taking the first (top-left) pixel & ignoring the other 3
 * - <b>Input format:</b> uint8_t, 1 channel
 * - <b>Output format:</b> uint8_t, 1 channel
 * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1
 * - <b>Notes:</b> Only every other input row is read; if \a h is odd the last row is ignored.
 *   The result is written contiguously, 8 bytes per 16 input pixels (i.e. w/2 bytes per output row).
 * - <b>Requires:</b> SSE2
 * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
 *
 * \param in  Pointer to the first pixel of the source image (read with aligned 16-byte loads).
 * \param out Pointer to the destination buffer; receives (w/2)*(h/2) bytes.
 * \param w   Source width in pixels (see preconditions).
 * \param h   Source height in pixels.
 */
void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
	// 0x00FF in each 16-bit lane: keeps the first byte of every horizontal pixel pair.
	const __m128i firstOfPairMask = _mm_set1_epi16(0x00FF);

	const int blocksPerRow = w >> 4;  // each block: 16 input pixels -> 8 output pixels
	const int outRows = h >> 1;

	for (int row = 0; row < outRows; ++row)
	{
		for (int blk = 0; blk < blocksPerRow; ++blk)
		{
			const __m128i pixels = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
			const __m128i kept = _mm_and_si128(pixels, firstOfPairMask);
			// Every 16-bit lane is <= 255, so the saturating pack simply narrows it to one byte;
			// the low 64 bits then hold the 8 output pixels.
			_mm_storel_epi64(reinterpret_cast<__m128i*>(out), _mm_packus_epi16(kept, kept));
			in += 16;
			out += 8;
		}
		// The inner loop consumed one full row (w = k*16); skip the following (discarded) row.
		in += w;
	}
}
60 
61 
/** Average each 2x2 pixels into 1x1 pixel (arithmetic average)
 * - <b>Input format:</b> uint8_t, 1 channel
 * - <b>Output format:</b> uint8_t, 1 channel
 * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1
 * - <b>Notes:</b> Each output pixel is (a+b+c+d+2)>>2 for the 2x2 block {a,b;c,d}, i.e. the exact
 *   mean rounded to nearest. The four pixels are summed in 16-bit lanes instead of chaining two
 *   rounding-up byte/word averages, which would bias the result upwards (e.g. {1,0;0,0} -> 1).
 *   If \a h is odd the last row is ignored. The result is written contiguously (w/2 bytes per output row).
 * - <b>Requires:</b> SSE2
 * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth()
 *
 * \param in  Pointer to the first pixel of the source image (read with aligned 16-byte loads).
 * \param out Pointer to the destination buffer; receives (w/2)*(h/2) bytes.
 * \param w   Source width in pixels (see preconditions).
 * \param h   Source height in pixels.
 */
void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
	// 0x00FF in each 16-bit lane: isolates the first pixel of every horizontal pair.
	const __m128i firstOfPairMask = _mm_set1_epi16(0x00FF);
	// +2 before the >>2 turns truncation into round-to-nearest.
	const __m128i roundingBias = _mm_set1_epi16(2);

	const int blocksPerRow = w >> 4;  // each block: 2 rows x 16 pixels -> 8 output pixels
	const int outRows = h >> 1;

	for (int row = 0; row < outRows; ++row)
	{
		for (int blk = 0; blk < blocksPerRow; ++blk)
		{
			const __m128i top = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
			const __m128i bottom = _mm_load_si128(reinterpret_cast<const __m128i*>(in + w));

			// Widen to 16 bits per pixel pair: the masked value is the first pixel of the pair,
			// the logical shift by 8 brings down the second one. Max sum is 4*255, so no overflow.
			__m128i sum = _mm_add_epi16(_mm_and_si128(top, firstOfPairMask), _mm_srli_epi16(top, 8));
			sum = _mm_add_epi16(sum, _mm_and_si128(bottom, firstOfPairMask));
			sum = _mm_add_epi16(sum, _mm_srli_epi16(bottom, 8));

			const __m128i mean = _mm_srli_epi16(_mm_add_epi16(sum, roundingBias), 2);

			// Lanes are <= 255, so the saturating pack just narrows; low 64 bits = 8 output pixels.
			_mm_storel_epi64(reinterpret_cast<__m128i*>(out), _mm_packus_epi16(mean, mean));
			in += 16;
			out += 8;
		}
		// The inner loop consumed one full row (w = k*16); skip the row already used as "bottom".
		in += w;
	}
}
98 
99 
100 
101 /** KLT score at a given point of a grayscale image.
102  * - <b>Requires:</b> SSE2
103  * - <b>Invoked from:</b> mrpt::utils::CImage::KLT_response()
104  *
105  * This function is not manually optimized for SSE2 but templatized for different
106  * window sizes so that the compiler can optimize automatically for each size.
107  *
108  * These templates are instantiated only for the most common window sizes (W=[2-16] and W=32),
109  * falling back to a generic implementation otherwise.
110  * The next figure shows the performance (time for
111  * KLT_response() to compute the score for one single pixel) for different window sizes.
112  *
113  * <img src="KLT_response_performance_SSE2.png" >
114  *
115  */
116 float KLT_response_optimized();
117 
118 // TODO:
119 // Sum of absolute differences: Use _mm_sad_epu8
120 
121 /** @} */
122 
123 #endif // end if MRPT_HAS_SSE2
GLenum GLint GLuint mask
Definition: glext.h:3888
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:3962
unsigned char uint8_t
Definition: rptypes.h:43
void image_SSE2_scale_half_smooth_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Average each 2x2 pixels into 1x1 pixel (arithmetic average)
Definition: CImage_SSE2.cpp:70
void image_SSE2_scale_half_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3...
Definition: CImage_SSE2.cpp:40
GLuint in
Definition: glext.h:6301
float KLT_response_optimized()
KLT score at a given point of a grayscale image.
#define MRPT_ALIGN16



Page generated by Doxygen 1.8.14 for MRPT 1.5.7 Git: 5902e14cc Wed Apr 24 15:04:01 2019 +0200 at lun oct 28 01:39:17 CET 2019