Lift
Library of parallel computing primitives for GPUs and multi-core CPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
x86_64_cpuid.cu
Go to the documentation of this file.
1 /*
2  * Lift
3  *
4  * Copyright (c) 2014-2015, NVIDIA CORPORATION
5  * Copyright (c) 2015, Nuno Subtil <subtil@gmail.com>
6  * Copyright (c) 2015, Roche Molecular Systems Inc.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are met:
11  * * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * * Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  * * Neither the name of the copyright holders nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <tbb/task_scheduler_init.h>
33 
34 #include <lift/types.h>
37 
38 #include "x86_64_cpuid.h"
39 
40 namespace lift {
41 
42 namespace x86_64 {
43 
/// Holds the four register values produced by one CPUID invocation.
/// The member order (eax, ebx, ecx, edx) is kept so aggregate
/// initialization continues to work as before.
struct cpuid_regs
{
    unsigned eax;   ///< value read back from EAX
    unsigned ebx;   ///< value read back from EBX
    unsigned ecx;   ///< value read back from ECX
    unsigned edx;   ///< value read back from EDX
};
51 
52 static inline void cpuid(cpuid_regs& output,
53  unsigned int code,
54  unsigned int count = 0)
55 {
56  __asm__ __volatile__ ("cpuid"
57  : "=a" (output.eax),
58  "=b" (output.ebx),
59  "=c" (output.ecx),
60  "=d" (output.edx)
61  : "0" (code),
62  "2" (count));
63 }
64 
65 static inline unsigned int cpuid_max(unsigned int extended = 0)
66 {
67  cpuid_regs regs;
68  cpuid(regs, 0);
69  return regs.eax;
70 }
71 
72 static inline unsigned int cpuid_max_extended(void)
73 {
74  cpuid_regs regs;
75  cpuid(regs, 0x80000000);
76  return regs.eax;
77 }
78 
80 {
81  ret.vector_extensions = 0;
82 
83  cpuid_regs regs;
84  cpuid(regs, 1);
85 
86 #define XL_BIT(register, cpuid_bit, lift_bit) \
87  if (regs.register & x86_64::CPUID_BIT_ ##cpuid_bit) \
88  ret.vector_extensions |= x86_64::lift_bit
89 
90  XL_BIT(edx, EDX_SSE, SSE);
91  XL_BIT(edx, EDX_SSE2, SSE2);
92  XL_BIT(ecx, ECX_SSE3, SSE3);
93  XL_BIT(ecx, ECX_SSSE3, SSE3_S);
94  XL_BIT(ecx, ECX_SSE41, SSE4_1);
95  XL_BIT(ecx, ECX_SSE42, SSE4_2);
96  XL_BIT(ecx, ECX_FMA, SSE_FMA3);
97  XL_BIT(ecx, ECX_F16C, SSE_F16C);
98  XL_BIT(ecx, ECX_AVX, AVX);
99 
100  if (cpuid_max() < 7)
101  {
102  return;
103  }
104 
105  cpuid(regs, 7);
106 
107  XL_BIT(ebx, EBX_AVX2, AVX2);
108 
109  if (cpuid_max_extended() < 1)
110  {
111  return;
112  }
113 
114  cpuid(regs, 0x80000001);
115 
116  XL_BIT(ecx, ECX_SSE4A, SSE4_a);
117  XL_BIT(ecx, ECX_FMA4, SSE_FMA4);
118  XL_BIT(ecx, ECX_XOP, SSE_XOP);
119 
120 #undef XL_BIT
121 }
122 
123 static void decode_cache_descriptor(cpu_config& ret, uint8 desc)
124 {
125  switch(desc)
126  {
127 #define C(desc, level, type, total_size, associativity, line_size) \
128  case desc: \
129  ret.caches.push_back({cpu_cache::type, level, associativity, total_size * 1024, line_size}); \
130  break;
131 
132  C(0x06, 1, instruction, 8, 4, 32);
133  C(0x08, 1, instruction, 16, 4, 32);
134  C(0x09, 1, instruction, 32, 4, 64);
135  C(0x0a, 1, data, 8, 2, 32);
136  C(0x0c, 1, data, 16, 4, 32);
137  C(0x0d, 1, data, 16, 4, 64);
138  C(0x0e, 1, data, 24, 6, 64);
139  C(0x1d, 2, unified, 128, 2, 64);
140  C(0x21, 2, unified, 256, 8, 64);
141  C(0x22, 3, unified, 512, 4, 64);
142  C(0x23, 3, unified, 1024, 8, 64);
143  C(0x24, 2, unified, 1024, 16, 64);
144  C(0x25, 3, unified, 2048, 8, 64);
145  C(0x29, 3, unified, 4096, 8, 64);
146  C(0x2c, 1, data, 32, 8, 64);
147  C(0x30, 1, instruction, 32, 8, 64);
148  C(0x41, 2, unified, 128, 4, 32);
149  C(0x42, 2, unified, 256, 4, 32);
150  C(0x43, 2, unified, 512, 4, 32);
151  C(0x44, 2, unified, 1024, 4, 32);
152  C(0x45, 2, unified, 2048, 4, 32);
153  C(0x46, 3, unified, 4096, 4, 64);
154  C(0x47, 3, unified, 8192, 8, 64);
155  C(0x48, 2, unified, 3072, 12, 64);
156 
157  // thank you intel
158  case 0x49:
159  {
160  cpuid_regs regs;
161  cpuid(regs, 1);
162 
163  uint8 model = (regs.eax & 0xf0) >> 4;
164  uint8 family = (regs.eax & 0xf00) >> 8;
165 
166  if (family == 0x0f && model == 0x06)
167  {
168  ret.caches.push_back({cpu_cache::unified, 3, 16, 4096 * 1024, 64});
169  } else {
170  ret.caches.push_back({cpu_cache::unified, 2, 16, 4096 * 1024, 64});
171  }
172  }
173 
174  break;
175 
176  C(0x4a, 3, unified, 6144, 12, 64);
177  C(0x4b, 3, unified, 8192, 16, 64);
178  C(0x4c, 3, unified, 12288, 12, 64);
179  C(0x4d, 3, unified, 16 * 1024, 16, 64);
180  C(0x4e, 2, unified, 6 * 1024, 24, 64);
181  C(0x60, 1, data, 16, 8, 64);
182  C(0x66, 1, data, 8, 4, 64);
183  C(0x67, 1, data, 16, 4, 64);
184  C(0x68, 1, data, 32, 4, 64);
185  C(0x78, 2, unified, 1024, 4, 64);
186  C(0x79, 2, unified, 128, 8, 64);
187  C(0x7a, 2, unified, 256, 8, 64);
188  C(0x7b, 2, unified, 512, 8, 64);
189  C(0x7c, 2, unified, 1024, 8, 64);
190  C(0x7d, 2, unified, 2048, 8, 64);
191  C(0x7f, 2, unified, 512, 2, 64);
192  C(0x80, 2, unified, 512, 8, 64);
193  C(0x82, 2, unified, 256, 8, 32);
194  C(0x83, 2, unified, 512, 8, 32);
195  C(0x84, 2, unified, 1024, 8, 32);
196  C(0x85, 2, unified, 2048, 8, 32);
197  C(0x86, 2, unified, 512, 4, 64);
198  C(0x87, 2, unified, 1024, 8, 64);
199  C(0xd0, 3, unified, 512, 4, 64);
200  C(0xd1, 3, unified, 1024, 4, 64);
201  C(0xd2, 3, unified, 2048, 4, 64);
202  C(0xd6, 3, unified, 1024, 8, 64);
203  C(0xd7, 3, unified, 2048, 8, 64);
204  C(0xd8, 3, unified, 4096, 8, 64);
205  C(0xdc, 3, unified, 1536, 12, 64);
206  C(0xdd, 3, unified, 3072, 12, 64);
207  C(0xde, 3, unified, 6144, 12, 64);
208  C(0xe2, 3, unified, 2048, 16, 64);
209  C(0xe3, 3, unified, 4096, 16, 64);
210  C(0xe4, 3, unified, 8192, 16, 64);
211  C(0xea, 3, unified, 12288, 24, 64);
212  C(0xeb, 3, unified, 18432, 24, 64);
213  C(0xec, 3, unified, 24576, 24, 64);
214  }
215 }
216 
218 {
219  cpuid_regs regs;
220 
221  for(uint32 in_ecx = 0; ; in_ecx++)
222  {
223  cpuid(regs, 4, in_ecx);
224 
225  if ((regs.eax & 0xf) == 0)
226  break;
227 
229 
230  switch(regs.eax & 0xf)
231  {
232  case 1:
233  cache_type = cpu_cache::data;
234  break;
235 
236  case 2:
237  cache_type = cpu_cache::instruction;
238  break;
239 
240  case 3:
241  cache_type = cpu_cache::unified;
242  break;
243  }
244 
245  unsigned int level = ((regs.eax >> 5) & 0x3);
246 
247  unsigned int ways = ((regs.ebx >> 22) & 0xff) + 1;
248  unsigned int partitions = ((regs.ebx >> 12) & 0xff) + 1;
249  unsigned int line_size = (regs.ebx & 0x3ff) + 1;
250  unsigned int sets = regs.ecx + 1;
251 
252  unsigned int total_size = ways * partitions * line_size * sets;
253 
254  cpu_cache cache = { cache_type, level, ways, total_size, line_size };
255  ret.caches.push_back(cache);
256  }
257 }
258 
259 static void identify_caches(cpu_config& ret)
260 {
261  ret.caches.clear();
262 
263  cpuid_regs regs;
264  cpuid(regs, 2);
265 
266  if ((regs.eax & (1u << 31)) == 0)
267  {
268  decode_cache_descriptor(ret, (regs.eax >> 24) & 0xff);
269  decode_cache_descriptor(ret, (regs.eax >> 16) & 0xff);
270  decode_cache_descriptor(ret, (regs.eax >> 8) & 0xff);
271  }
272 
273  if ((regs.ebx & (1u << 31)) == 0)
274  {
275  decode_cache_descriptor(ret, (regs.ebx >> 24) & 0xff);
276  decode_cache_descriptor(ret, (regs.ebx >> 16) & 0xff);
277  decode_cache_descriptor(ret, (regs.ebx >> 8) & 0xff);
278  decode_cache_descriptor(ret, (regs.ebx >> 0) & 0xff);
279  }
280 
281  if ((regs.ecx & (1u << 31)) == 0)
282  {
283  decode_cache_descriptor(ret, (regs.ecx >> 24) & 0xff);
284  decode_cache_descriptor(ret, (regs.ecx >> 16) & 0xff);
285  decode_cache_descriptor(ret, (regs.ecx >> 8) & 0xff);
286  decode_cache_descriptor(ret, (regs.ecx >> 0) & 0xff);
287  }
288 
289  if ((regs.edx & (1u << 31)) == 0)
290  {
291  decode_cache_descriptor(ret, (regs.edx >> 24) & 0xff);
292  decode_cache_descriptor(ret, (regs.edx >> 16) & 0xff);
293  decode_cache_descriptor(ret, (regs.edx >> 8) & 0xff);
294  decode_cache_descriptor(ret, (regs.edx >> 0) & 0xff);
295  }
296 
298 }
299 
301 {
302  char brand_string[sizeof(unsigned int) * 4 * 4 + 1] = { 0 };
303 
304  cpuid_regs regs;
305 
306  cpuid(regs, 0x80000002);
307  memcpy(&brand_string[ 0], &regs.eax, sizeof(regs.eax));
308  memcpy(&brand_string[ 4], &regs.ebx, sizeof(regs.ebx));
309  memcpy(&brand_string[ 8], &regs.ecx, sizeof(regs.ecx));
310  memcpy(&brand_string[12], &regs.edx, sizeof(regs.edx));
311 
312  cpuid(regs, 0x80000003);
313  memcpy(&brand_string[16], &regs.eax, sizeof(regs.eax));
314  memcpy(&brand_string[20], &regs.ebx, sizeof(regs.ebx));
315  memcpy(&brand_string[24], &regs.ecx, sizeof(regs.ecx));
316  memcpy(&brand_string[28], &regs.edx, sizeof(regs.edx));
317 
318  cpuid(regs, 0x80000004);
319  memcpy(&brand_string[32], &regs.eax, sizeof(regs.eax));
320  memcpy(&brand_string[36], &regs.ebx, sizeof(regs.ebx));
321  memcpy(&brand_string[40], &regs.ecx, sizeof(regs.ecx));
322  memcpy(&brand_string[44], &regs.edx, sizeof(regs.edx));
323 
324  std::string name = brand_string;
325 
326  // trim leading/trailing whitespace
327  auto first_non_space = name.find_first_not_of(" ");
328  auto last_non_space = name.find_last_not_of(" ");
329 
330  ret.name = name.substr(first_non_space, last_non_space - first_non_space + 1);
331 }
332 
333 } // namespace x86_64
334 
335 namespace __internal {
336 
// Builds and returns a cpu_config describing the host processor.
// The visible code records the TBB default thread count and checks the
// highest supported CPUID level; it returns the partially filled record
// immediately when that level is below 1.
// NOTE(review): this listing omits the function's signature line and
// source lines 353-355, so whatever fills the remaining fields is not
// visible here - confirm against the real file.
338 {
339  cpu_config ret;
340  unsigned int max_level;
341 
// thread count is whatever TBB reports as its default
342  ret.num_threads = tbb::task_scheduler_init::default_num_threads();
343 
344  // get the maximum CPUID level supported
345  max_level = x86_64::cpuid_max();
346 
347  if (max_level < 1)
348  {
349  // should never happen
350  return ret;
351  }
352 
356 
357  return ret;
358 }
359 
360 } // namespace __internal
361 
362 } // namespace lift
static constexpr uint32 SSE4_1
Definition: vector_flags.h:42
static constexpr uint32 AVX2
Definition: vector_flags.h:50
static void identify_vector_extensions(cpu_config &ret)
Definition: x86_64_cpuid.cu:79
static unsigned int cpuid_max_extended(void)
Definition: x86_64_cpuid.cu:72
static constexpr uint32 AVX
Definition: vector_flags.h:49
uint32_t uint32
Definition: types.h:43
cpu_config identify_host_cpu(void)
static constexpr uint32 SSE_FMA4
Definition: vector_flags.h:46
#define XL_BIT(register, cpuid_bit, lift_bit)
static void scan_leaf4_cache_info(cpu_config &ret)
std::vector< cpu_cache > caches
static constexpr uint32 SSE
Definition: vector_flags.h:38
static constexpr uint32 SSE3_S
Definition: vector_flags.h:41
static void cpuid(cpuid_regs &output, unsigned int code, unsigned int count=0)
Definition: x86_64_cpuid.cu:52
static constexpr uint32 SSE4_2
Definition: vector_flags.h:43
static void decode_cache_descriptor(cpu_config &ret, uint8 desc)
static constexpr uint32 SSE_FMA3
Definition: vector_flags.h:47
static void identify_caches(cpu_config &ret)
static void get_cpu_brand_string(cpu_config &ret)
static constexpr uint32 SSE2
Definition: vector_flags.h:39
static constexpr uint32 SSE_XOP
Definition: vector_flags.h:45
uint8_t uint8
Definition: types.h:39
static constexpr uint32 SSE4_a
Definition: vector_flags.h:44
static constexpr uint32 SSE_F16C
Definition: vector_flags.h:48
#define C(desc, level, type, total_size, associativity, line_size)
static constexpr uint32 SSE3
Definition: vector_flags.h:40
static unsigned int cpuid_max(unsigned int extended=0)
Definition: x86_64_cpuid.cu:65