Rev 154 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 154 | Rev 169 | ||
---|---|---|---|
Line 4... | Line 4... | ||
4 | Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad |
4 | Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad |
5 | Copyright (C) 2015- |
5 | Copyright (C) 2015-2018 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad |
6 | 6 | ||
7 | Stockfish is free software: you can redistribute it and/or modify |
7 | Stockfish is free software: you can redistribute it and/or modify |
8 | it under the terms of the GNU General Public License as published by |
8 | it under the terms of the GNU General Public License as published by |
9 | the Free Software Foundation, either version 3 of the License, or |
9 | the Free Software Foundation, either version 3 of the License, or |
10 | (at your option) any later version. |
10 | (at your option) any later version. |
Line 15... | Line 15... | ||
15 | GNU General Public License for more details. |
15 | GNU General Public License for more details. |
16 | 16 | ||
17 | You should have received a copy of the GNU General Public License |
17 | You should have received a copy of the GNU General Public License |
18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | */ |
19 | */ |
- | 20 | ||
- | 21 | #ifdef _WIN32 |
|
- | 22 | #if _WIN32_WINNT < 0x0601 |
|
- | 23 | #undef _WIN32_WINNT |
|
- | 24 | #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes |
|
- | 25 | #endif |
|
- | 26 | #include <windows.h> |
|
- | 27 | // The Windows API needed for processor groups could be missing from old Windows |
|
- | 28 | // versions, so instead of calling them directly (forcing the linker to resolve |
|
- | 29 | // the calls at link time), try to load them at runtime. To do this we need |
|
- | 30 | // first to define the corresponding function pointers. |
|
- | 31 | extern "C" { |
|
- | 32 | typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP, |
|
- | 33 | PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD); |
|
- | 34 | typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY); |
|
- | 35 | typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); |
|
- | 36 | } |
|
- | 37 | #endif |
|
20 | 38 | ||
21 | #include <fstream> |
39 | #include <fstream> |
22 | #include <iomanip> |
40 | #include <iomanip> |
23 | #include <iostream> |
41 | #include <iostream> |
24 | #include <sstream> |
42 | #include <sstream> |
- | 43 | #include <vector> |
|
25 | 44 | ||
26 | #include "misc.h" |
45 | #include "misc.h" |
27 | #include "thread.h" |
46 | #include "thread.h" |
28 | 47 | ||
29 | using namespace std; |
48 | using namespace std; |
30 | 49 | ||
31 | namespace { |
50 | namespace { |
32 | 51 | ||
33 | /// Version number. If Version is left empty, then the compile date in the |
52 | /// Version number. If Version is left empty, then the compile date in the |
34 | /// format DD-MM-YY is shown in engine_info. |
53 | /// format DD-MM-YY is shown in engine_info. |
35 | const string Version = " |
54 | const string Version = "9"; |
36 | 55 | ||
37 | /// Our fancy logging facility. The trick here is to replace cin.rdbuf() and |
56 | /// Our fancy logging facility. The trick here is to replace cin.rdbuf() and |
38 | /// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We |
57 | /// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We |
39 | /// can toggle the logging of std::cout and std::cin at runtime whilst preserving |
58 | /// can toggle the logging of std::cout and std::cin at runtime whilst preserving |
40 | /// usual I/O functionality, all without changing a single line of code! |
59 | /// usual I/O functionality, all without changing a single line of code! |
41 | /// Idea from http://groups.google.com/group/comp.lang.c++/msg/1d941c0f26ea0d81 |
60 | /// Idea from http://groups.google.com/group/comp.lang.c++/msg/1d941c0f26ea0d81 |
42 | 61 | ||
/// Tie is a stream buffer that forwards every operation to the wrapped buffer
/// 'buf' and copies each character actually transferred to 'logBuf'. A line
/// written to the log is prefixed with "<< " for output and ">> " for input.
/// End-of-file results are never logged and are passed back unchanged, so the
/// owning stream still sees EOF and write failures.
struct Tie: public std::streambuf { // MSVC requires split streambuf for cin and cout

  Tie(std::streambuf* b, std::streambuf* l) : buf(b), logBuf(l) {}

  // Flush the log first, then report the sync status of the real buffer
  int sync() override {
      logBuf->pubsync();
      return buf->pubsync();
  }

  // Write one character to the real buffer and mirror it to the log.
  // overflow(EOF) carries no character, so it succeeds without writing.
  int overflow(int c) override {
      if (traits_type::eq_int_type(c, traits_type::eof()))
          return traits_type::not_eof(c);

      return log(buf->sputc(traits_type::to_char_type(c)), "<< ");
  }

  // Peek at the next input character without consuming (or logging) it
  int underflow() override { return buf->sgetc(); }

  // Consume the next input character and mirror it to the log
  int uflow() override { return log(buf->sbumpc(), ">> "); }

  std::streambuf *buf, *logBuf;

  /// log() appends character c to the log, emitting the 3-character prefix
  /// when the previously logged character was a newline. Returns the result
  /// of the log write, or EOF untouched when c is EOF.
  int log(int c, const char* prefix) {

    static int last = '\n'; // Single log file

    // EOF is a status, not a character: do not log it and do not hide it
    if (traits_type::eq_int_type(c, traits_type::eof()))
        return c;

    if (last == '\n')
        logBuf->sputn(prefix, 3);

    return last = logBuf->sputc(traits_type::to_char_type(c));
  }
};
64 | 83 | ||
65 | class Logger { |
84 | class Logger { |
66 | 85 | ||
67 | Logger() : in(cin.rdbuf(), file.rdbuf()), out(cout.rdbuf(), file.rdbuf()) {} |
86 | Logger() : in(cin.rdbuf(), file.rdbuf()), out(cout.rdbuf(), file.rdbuf()) {} |
68 | ~Logger() { start(""); } |
87 | ~Logger() { start(""); } |
69 | 88 | ||
70 | ofstream file; |
89 | ofstream file; |
71 | Tie in, out; |
90 | Tie in, out; |
Line 162... | Line 181... | ||
162 | 181 | ||
163 | /// prefetch() preloads the given address in L1/L2 cache. This is a non-blocking |
182 | /// prefetch() preloads the given address in L1/L2 cache. This is a non-blocking |
164 | /// function that doesn't stall the CPU waiting for data to be loaded from memory, |
183 | /// function that doesn't stall the CPU waiting for data to be loaded from memory, |
165 | /// which can be quite slow. |
184 | /// which can be quite slow. |
166 | #ifdef NO_PREFETCH |
185 | #ifdef NO_PREFETCH |
167 | 186 | ||
168 | void prefetch(void*) {} |
187 | void prefetch(void*) {} |
169 | 188 | ||
170 | #else |
189 | #else |
171 | 190 | ||
172 | void prefetch(void* addr) { |
191 | void prefetch(void* addr) { |
173 | 192 | ||
174 | # if defined(__INTEL_COMPILER) |
193 | # if defined(__INTEL_COMPILER) |
175 | // This hack prevents prefetches from being optimized away by |
194 | // This hack prevents prefetches from being optimized away by |
176 | // Intel compiler. Both MSVC and gcc seem not to be affected by this. |
195 | // Intel compiler. Both MSVC and gcc seem not to be affected by this. |
177 | __asm__ (""); |
196 | __asm__ (""); |
178 | # endif |
197 | # endif |
Line 183... | Line 202... | ||
183 | __builtin_prefetch(addr); |
202 | __builtin_prefetch(addr); |
184 | # endif |
203 | # endif |
185 | } |
204 | } |
186 | 205 | ||
187 | #endif |
206 | #endif |
- | 207 | ||
- | 208 | void prefetch2(void* addr) { |
|
- | 209 | ||
- | 210 | prefetch(addr); |
|
- | 211 | prefetch((uint8_t*)addr + 64); |
|
- | 212 | } |
|
- | 213 | ||
namespace WinProcGroup {

#ifndef _WIN32

/// Outside Windows there is nothing to bind: this is a no-op
void bindThisThread(size_t) {}

#else

/// get_group() retrieves logical processor information using Windows specific
/// API and returns the best group id for the thread with index idx, or -1 when
/// the API is unavailable, the query fails, or idx exceeds the number of
/// logical processors counted (in which case the OS is left to decide).
/// Original code from Texel by Peter Österlund.

int get_group(size_t idx) {

  int threads = 0;
  int nodes = 0;
  int cores = 0;
  DWORD returnLength = 0;
  DWORD byteOffset = 0;

  // Early exit if the needed API is not available at runtime. TEXT() keeps
  // the call valid whether or not UNICODE is defined.
  HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
  auto fun1 = (fun1_t)GetProcAddress(k32, "GetLogicalProcessorInformationEx");
  if (!fun1)
      return -1;

  // First call to get returnLength. We expect it to fail due to null buffer
  if (fun1(RelationAll, nullptr, &returnLength))
      return -1;

  // Once we know returnLength, allocate the buffer
  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr;
  ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength);
  if (!buffer)
      return -1;

  // Second call, now we expect to succeed
  if (!fun1(RelationAll, buffer, &returnLength))
  {
      free(buffer);
      return -1;
  }

  // Walk the variable-sized records. The offset is checked before touching
  // ptr->Size so that we never read past the end of the buffer once the last
  // record has been consumed.
  while (   byteOffset < returnLength
         && ptr->Size > 0
         && byteOffset + ptr->Size <= returnLength)
  {
      if (ptr->Relationship == RelationNumaNode)
          nodes++;

      else if (ptr->Relationship == RelationProcessorCore)
      {
          cores++;
          threads += (ptr->Processor.Flags == LTP_PC_SMT) ? 2 : 1;
      }

      byteOffset += ptr->Size;
      ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size);
  }

  free(buffer);

  // Without any node record there is nothing to choose from (and the modulo
  // below would divide by zero): let the OS decide.
  if (nodes <= 0)
      return -1;

  std::vector<int> groups;

  // Run as many threads as possible on the same node until core limit is
  // reached, then move on filling the next node.
  for (int n = 0; n < nodes; n++)
      for (int i = 0; i < cores / nodes; i++)
          groups.push_back(n);

  // In case a core has more than one logical processor (we assume 2) and we
  // still have threads to allocate, then spread them evenly across available
  // nodes.
  for (int t = 0; t < threads - cores; t++)
      groups.push_back(t % nodes);

  // If we still have more threads than the total number of logical processors
  // then return -1 and let the OS decide what to do.
  return idx < groups.size() ? groups[idx] : -1;
}


/// bindThisThread() sets the group affinity of the current thread according
/// to the group returned by get_group(idx). It silently does nothing when no
/// group is suggested or when the needed API is missing.

void bindThisThread(size_t idx) {

  // Use only local variables to be thread-safe
  int group = get_group(idx);

  if (group == -1)
      return;

  // Early exit if the needed APIs are not available at runtime
  HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
  auto fun2 = (fun2_t)GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
  auto fun3 = (fun3_t)GetProcAddress(k32, "SetThreadGroupAffinity");

  if (!fun2 || !fun3)
      return;

  // Fetch the processor mask of the chosen node and apply it to this thread
  GROUP_AFFINITY affinity;
  if (fun2(USHORT(group), &affinity))
      fun3(GetCurrentThread(), &affinity, nullptr);
}

#endif

} // namespace WinProcGroup