CImage_SSE2.cpp Source File

Go to the documentation of this file.
 /* +------------------------------------------------------------------------+
    |                     Mobile Robot Programming Toolkit (MRPT)            |
    |                          http://www.mrpt.org/                          |
    |                                                                        |
    | Copyright (c) 2005-2018, Individual contributors, see AUTHORS file     |
    | See: http://www.mrpt.org/Authors - All rights reserved.                |
    | Released under BSD License. See details in http://www.mrpt.org/License |
    +------------------------------------------------------------------------+ */
  
 #include "img-precomp.h"  // Precompiled headers
  
 #if MRPT_HAS_SSE2
 // ---------------------------------------------------------------------------
 //   This file contains the SSE2 optimized functions for mrpt::img::CImage
 //    See the sources and the doxygen documentation page "sse_optimizations" for
 //    more details.
 //
 //  Some functions here are derived from sources in libcvd, released
 //   under BSD. https://www.edwardrosten.com/cvd/
 //
 // ---------------------------------------------------------------------------
  
 #include <mrpt/img/CImage.h>
 #include <mrpt/core/SSE_types.h>
 #include <mrpt/core/SSE_macros.h>
 #include "CImage_SSEx.h"
  
 /** \addtogroup sse_optimizations
  *  SSE optimized functions
  *  @{
  */
  
 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel &
  * ignoring the other 3
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
  * pixels), widthStep=w*1
  *  - <b>Notes:</b>
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::img::CImage::scaleHalf()
  */
 void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
 {
         alignas(32) const unsigned long long mask[2] = {0x00FF00FF00FF00FFull,
                                                                                                         0x00FF00FF00FF00FFull};
         const __m128i m = _mm_load_si128((const __m128i*)mask);
  
         int sw = w >> 4;
         int sh = h >> 1;
  
         for (int i = 0; i < sh; i++)
         {
                 for (int j = 0; j < sw; j++)
                 {
                         const __m128i here_sampled =
                                 _mm_and_si128(_mm_load_si128((const __m128i*)in), m);
                         _mm_storel_epi64(
                                 (__m128i*)out, _mm_packus_epi16(here_sampled, here_sampled));
                         in += 16;
                         out += 8;
                 }
                 in += w;
         }
 }
  
 /** Average each 2x2 pixels into 1x1 pixel (arithmetic average)
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
  * pixels), widthStep=w*1
  *  - <b>Notes:</b>
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::img::CImage::scaleHalfSmooth()
  */
 void image_SSE2_scale_half_smooth_1c8u(
         const uint8_t* in, uint8_t* out, int w, int h)
 {
         alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask[2] = {0x00FF00FF00FF00FFull,
                                                                                                         0x00FF00FF00FF00FFull};
         const uint8_t* nextRow = in + w;
         __m128i m = _mm_load_si128((const __m128i*)mask);
         int sw = w >> 4;
         int sh = h >> 1;
  
         for (int i = 0; i < sh; i++)
         {
                 for (int j = 0; j < sw; j++)
                 {
                         __m128i here = _mm_load_si128((const __m128i*)in);
                         __m128i next = _mm_load_si128((const __m128i*)nextRow);
                         here = _mm_avg_epu8(here, next);
                         next = _mm_and_si128(_mm_srli_si128(here, 1), m);
                         here = _mm_and_si128(here, m);
                         here = _mm_avg_epu16(here, next);
                         _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here, here));
                         in += 16;
                         nextRow += 16;
                         out += 8;
                 }
  
                 in += w;
                 nextRow += w;
         }
 }
  
 /** KLT score at a given point of a grayscale image.
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::img::CImage::KLT_response()
  *
  *  This function is not manually optimized for SSE2 but templatized for
  * different
  *   window sizes such as the compiler can optimize automatically for that
  * size.
  *
  *  Only for the most common window sizes this templates are instantiated
  * (W=[2-16] and W=32 ),
  *   falling back to
  *   a generic implementation otherwise. The next figure shows the performance
  * (time for
  *   KLT_response() to compute the score for one single pixel) for different
  * window sizes.
  *
  *  <img src="KLT_response_performance_SSE2.png" >
  *
  */
 float KLT_response_optimized();
  
 // TODO:
 // Sum of absolute differences: Use  _mm_sad_epu8
  
 /**  @} */
  
 #endif  // end if MRPT_HAS_SSE2