Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===------------------------------------------------------------------------===
  8.  */
  9.  
  10. #if !defined(__IMMINTRIN_H)
  11. #error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
  12. #endif /* !defined(__IMMINTRIN_H) */
  13.  
  14. #ifndef __AMX_FP16INTRIN_H
  15. #define __AMX_FP16INTRIN_H
  16. #ifdef __x86_64__
  17.  
  18. /// Multiply-accumulate FP16 pairs from tiles \a a and \a b into tile \a dst.
  19. ///    Every FP16 (16-bit) operand is widened to single precision (32-bit),
  20. ///    the products are summed onto the matching 32-bit element of \a dst,
  21. ///    and the 32-bit sums are written back to tile \a dst.
  22. ///
  23. /// \headerfile <immintrin.h>
  24. ///
  25. /// \code
  26. /// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
  27. /// \endcode
  28. ///
  29. /// \code{.operation}
  30. /// FOR m := 0 TO dst.rows - 1
  31. ///     tmp := dst.row[m]
  32. ///     FOR k := 0 TO (a.colsb / 4) - 1
  33. ///             FOR n := 0 TO (dst.colsb / 4) - 1
  34. ///                     tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
  35. ///                                     FP32(b.row[k].fp16[2*n+0])
  36. ///                     tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
  37. ///                                     FP32(b.row[k].fp16[2*n+1])
  38. ///             ENDFOR
  39. ///     ENDFOR
  40. ///     write_row_and_zero(dst, m, tmp, dst.colsb)
  41. /// ENDFOR
  42. /// zero_upper_rows(dst, dst.rows)
  43. /// zero_tileconfig_start()
  44. /// \endcode
  45. ///
  46. /// The instruction behind this intrinsic is \c TDPFP16PS.
  47. ///
  48. /// \param dst
  49. ///    Tile that is both accumulated into and written. Max size is 1024 Bytes.
  50. /// \param a
  51. ///    First source tile (left-hand operand). Max size is 1024 Bytes.
  52. /// \param b
  53. ///    Second source tile (right-hand operand). Max size is 1024 Bytes.
  54. #define _tile_dpfp16ps(dst, a, b)                                \
  55.   __builtin_ia32_tdpfp16ps((dst), (a), (b))
  56.  
  57. #endif /* __x86_64__ */
  58. #endif /* __AMX_FP16INTRIN_H */
  59.