#include "chess.h"
#include "data.h"
#include "epdglue.h"
/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   Split() is the driver for the threaded parallel search in Crafty.  The   *
 *   basic idea is that whenever we notice that one (or more) threads are in  *
 *   their idle loop, we drop into Split(), from Search(), and begin a new    *
 *   parallel search from this node.  This is simply a problem of             *
 *   establishing a new split point, and then letting each thread join this   *
 *   split point and copy whatever data they need.                            *
 *                                                                             *
 *   This is generation II of Split().  The primary differences address two   *
 *   specific performance-robbing issues:  (1) excessive waiting for a split  *
 *   to be done, and (2) excessive waiting on specific locks.  Generation II  *
 *   addresses both of these to significantly improve performance.            *
 *                                                                             *
 *   The main difference between Gen I and Gen II is the effort required to   *
 *   split the tree and which thread(s) expend this effort.  In generation I, *
 *   the parent thread was responsible for allocating a split block for each  *
 *   child thread, and then copying the necessary data from the parent split  *
 *   block to these child split blocks.  When all of this was completed, the  *
 *   child processes were released to start the parallel search after being   *
 *   held while the split / copy operations were done.  In the generation II  *
 *   Split() we now simply allocate a new split block for THIS thread, flag   *
 *   the parent split block as joinable, and then go directly to ThreadWait() *
 *   which will drop us back into the search immediately.  The idle threads   *
 *   are continually looping on Join() which will jump them right into this   *
 *   split block, letting them do ALL the work of allocating a split block,   *
 *   filling it in, and then copying the data to their local split block.     *
 *   This distributes the split overhead among all the threads that split,    *
 *   rather than this thread having to do all the work while the other        *
 *   threads sit idle.                                                        *
 *                                                                             *
 *   Generation II is also much more lightweight, in that it copies only the  *
 *   bare minimum from parent to child.  Generation I safely copied far too   *
 *   much since this code was being changed regularly, but that is no longer  *
 *   necessary overhead.                                                      *
 *                                                                             *
 *   Generation II has a zero contention split algorithm.  In the past, when  *
 *   a thread became idle, it posted a global split request and any thread    *
 *   that was at an appropriate point would try to split.  But while it was   *
 *   doing the splitting, other threads that were also willing to split would *
 *   "get in line" because Crafty used a global lock to prevent two threads   *
 *   from attempting to split at the same instant in time.  They got in line, *
 *   and waited for the original splitter to release the lock, but by then    *
 *   they had no idle threads to split with.  A waste of time.  Now we allow  *
 *   ANY thread to attempt to split at the current ply.  When we do what      *
 *   might be called a "gratuitous split", the only restriction is that if we *
 *   have existing "gratuitous split points" (split blocks that are joinable  *
 *   but have not yet been joined), then we limit the number of such splits   *
 *   (per thread) to avoid excessive overhead.                                *
 *                                                                             *
 *   Generation II takes another idea from DTS, the idea of "late-join".  The *
 *   idea is fairly simple.  If, when a thread becomes idle, there are        *
 *   already other split points being searched in parallel, then we will try  *
 *   to join one of them rather than waiting for someone to ask us to help.   *
 *   We use some simple criteria:  (1) the split point must be joinable,      *
 *   which simply means that no processor has exited the split point yet      *
 *   (which would let us know there is no more work here and a join would be  *
 *   futile); (2) we compute an "interest" value, which is a simple formula   *
 *   based on depth at the split point and the number of moves already        *
 *   searched.  It seems less risky to choose a split point with max depth    *
 *   AND minimum moves already searched so that there is plenty to do.  This  *
 *   was quite simple to add after the rest of the Generation II rewrite.  In *
 *   fact, this is now THE way threads join a split point, period, which      *
 *   further simplifies the code and improves efficiency.  IE, a thread can   *
 *   split when idle threads are noticed, or if it is far enough away from    *
 *   the tips to make the cost negligible.  At that point any idle thread(s)  *
 *   can join immediately, and those that become idle later can join when     *
 *   they are ready.                                                          *
 *                                                                             *
 *   There are a number of settable options via the command-line or .craftyrc *
 *   initialization file.  Here's a concise explanation for each option and   *
 *   an occasional suggestion for testing/tuning.                             *
 *                                                                             *
 *   smp_affinity (command = smpaffinity=<n>) is used to enable or disable    *
 *     processor affinity.  -1 disables affinity and lets threads run on any  *
 *     available core.  If you use an integer <n>, then thread zero will bind *
 *     itself to cpu <n> and each additional thread will bind to the next     *
 *     higher cpu number.  This is useful if you try to run two copies of     *
 *     Crafty on the same machine; now you can cause one to bind to the first *
 *     <n> cores, and the second to the last <n> cores.  For the first        *
 *     instance of Crafty, you would use smpaffinity=0, and for the second    *
 *     smpaffinity=8, assuming you are running 8 threads per copy on a 16 cpu *
 *     machine.  If you get this wrong, you can have more than one thread on  *
 *     the same cpu, which will significantly impact performance.             *
 *                                                                             *
 *   smp_max_threads (command = smpmt=n) sets the total number of allowable   *
 *     threads for the search.  The default is one (1) as Crafty does not     *
 *     assume it should use all available resources.  For optimal performance *
 *     this should be set to the number of physical cores your machine has,   *
 *     which does NOT include hyperthreaded cores.                            *
 *                                                                             *
 *   smp_split_group (command = smpgroup=n) sets the maximum number of        *
 *     threads at any single split point, with the exception of split points  *
 *     fairly close to the root where ALL threads are allowed to split        *
 *     together, ignoring this limit.  Note that this is ignored in the first *
 *     1/2 of the tree (the nodes closer to the root).  There it is actually  *
 *     good to split and get all active threads involved.                     *
 *                                                                             *
 *   smp_min_split_depth (command = smpmin=n) avoids splitting when remaining *
 *     depth < n.  This is used to balance splitting overhead cost against    *
 *     the speed gain the parallel search produces.  The default is currently *
 *     5 (which could change with future generations of Intel hardware) but   *
 *     values between 4 and 8 will work.  Larger values allow somewhat fewer  *
 *     splits, which reduces overhead, but it also increases the percentage   *
 *     of the time where a thread is waiting on work.                         *
 *                                                                             *
 *   smp_split_at_root (command = smproot=0 or 1) enables (1) or disables (0) *
 *     splitting the tree at the root.  This defaults to 1, which produces    *
 *     the best performance by a significant margin.  But it can be disabled  *
 *     if you are playing with code changes.                                  *
 *                                                                             *
 *   smp_gratuitous_depth (command = smpgd=<n>) controls "gratuitous splits", *
 *     which are splits that are done without any idle threads.  This sets a  *
 *     depth limit (remaining depth) that must be present before such a split *
 *     can be done.  Making this number larger will reduce the number of      *
 *     these splits.  Making it too small will increase overhead slightly and *
 *     increase split block usage significantly.                              *
 *                                                                             *
 *   smp_gratuitous_limit (command = smpgl=<n>) limits the number of these    *
 *     splits that a thread can do.  Once a thread has this number of         *
 *     unjoined split points, it will not be allowed to split any more until  *
 *     one or more threads join at least one of the existing split points.    *
 *     In the smp search statistics, you will see output that looks like      *
 *     this:                                                                  *
 *                                                                             *
 *       splits=m(n) ...                                                      *
 *                                                                             *
 *     m is the total splits done; n is the number of "wasted splits", which  *
 *     are basically gratuitous splits where no thread joined before the      *
 *     split point was completed and deallocated.                             *
 *                                                                             *
 *   The best way to tune all of these parameters is to use the "autotune"    *
 *   command (see autotune.c and help autotune) which will automatically run  *
 *   tests and optimize the parameters.  More details are in the autotune.c   *
 *   source file.                                                             *
 *                                                                             *
 *   A few basic "rules of the road" for anyone interested in changing or     *
 *   adding to any of this code:                                              *
 *                                                                             *
 *   1.  If, at any time, you want to modify your private split block, no     *
 *       lock is required.                                                    *
 *                                                                             *
 *   2.  If, at any time, you want to modify another split block, such as the *
 *       parent split block shared move list, you must acquire the lock in    *
 *       that split block first.  IE tree->parent->lock to lock the parent    *
 *       split block during NextMove() and such.                              *
 *                                                                             *
 *   3.  If you want to modify any SMP-related data that spans multiple split *
 *       blocks, such as telling sibling processes to stop, etc., then you    *
 *       must acquire the global "lock_smp" lock first.  This prevents two    *
 *       different threads from walking the split block chain from different  *
 *       directions and acquiring the split block locks in different orders,  *
 *       which could cause a catastrophic deadlock.  This is an infrequent    *
 *       event, so the overhead is not significant.                           *
 *                                                                             *
 *   4.  If you want to do any sort of I/O operation, you must acquire the    *
 *       "lock_io" lock first.  Since threads share descriptors, there are    *
 *       lots of potential race conditions, from simple tty output getting    *
 *       interlaced from different threads, to outright data corruption in    *
 *       the book or log files.                                               *
 *                                                                             *
 *   Some of the bugs caused by failing to acquire the correct lock will only *
 *   occur infrequently, and they are extremely difficult to find.  Some only *
 *   show up in a public game where everyone is watching, causing maximum     *
 *   embarrassment as the program does something extremely stupid.            *
 *                                                                             *
 *******************************************************************************
 */
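/*
   Illustrative example (added; not from the Crafty sources):  since the
   options above are ordinary commands, a .craftyrc for the first of the
   two 8-thread instances on the 16-cpu machine described above might
   contain lines like the following.  The values are hypothetical; the
   "autotune" command is the preferred way to choose them.

     smpmt=8
     smpaffinity=0
     smpmin=5
     smproot=1
     smpgd=8
     smpgl=2
*/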
int Split(TREE * RESTRICT tree) {
  TREE *child;
  int tid, tstart, tend;

/*
 ************************************************************
 *                                                          *
 *  Here we prepare to split the tree.  All we really do   *
 *  in the Generation II threading is grab a split block   *
 *  for this thread, then flag the parent as "joinable"    *
 *  and then jump right to ThreadWait() to resume where    *
 *  we left off, with the expectation (but not a           *
 *  requirement) that other threads will join us to help.  *
 *                                                          *
 *  Idle threads are sitting in ThreadWait() repeatedly    *
 *  calling Join() to find them a split point, which we    *
 *  are fixing to provide.  They will then join as quickly *
 *  as they can, and other threads that become idle later  *
 *  can also join without any further splitting needed.    *
 *                                                          *
 *  If we are unable to allocate a split block, we simply  *
 *  abort this attempted split and return to the search,   *
 *  since other threads will also split quickly.           *
 *                                                          *
 ************************************************************
 */
  tstart = ReadClock();
  tree->nprocs = 0;
  for (tid = 0; tid < smp_max_threads; tid++)
    tree->siblings[tid] = 0;
  child = GetBlock(tree, tree->thread_id);
  if (!child)
    return 0;
  CopyFromParent(child);
  thread[tree->thread_id].tree = child;
  tree->joined = 0;
  tree->joinable = 1;
  parallel_splits++;
  smp_split = 0;
  tend = ReadClock();
  thread[tree->thread_id].idle += tend - tstart;
/*
 ************************************************************
 *                                                          *
 *  We have successfully created a split point, which      *
 *  means we are done.  The instant we set the "joinable"  *
 *  flag, idle threads may begin to join in at this split  *
 *  point to help.  Since this thread may finish before    *
 *  any or all of the other parallel threads, this thread  *
 *  is sent to ThreadWait() which will immediately send it *
 *  to SearchMoveList() like the other threads; however,   *
 *  going to ThreadWait() allows this thread to join       *
 *  others if it runs out of work to do.  We do pass       *
 *  ThreadWait() the address of the parent split block, so *
 *  that if this thread becomes idle, and this split block *
 *  shows no threads are still busy, then this thread can  *
 *  return here and then back up into the previous ply as  *
 *  it should.  Note that no other thread can back up to   *
 *  the previous ply since their recursive call stacks are *
 *  not set for that, while this call stack will bring us  *
 *  back to this point where we return to the normal       *
 *  search, which we just completed.                       *
 *                                                          *
 ************************************************************
 */
  ThreadWait(tree->thread_id, tree);
  if (!tree->joined)
    parallel_splits_wasted++;
  return 1;
}
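/*
   Added note (a summary of the control flow above, not original Crafty
   commentary):  a split point's life is bracketed by the two flags set
   here.  Split() publishes it with joinable=1 and joined=0; GetBlock()
   (via Join()) bumps nprocs and sets joined=1 when a helper arrives;
   the first thread to run out of work there (or ThreadStop()) clears
   joinable so nobody else joins; and if joined never became 1, the
   split is counted above as wasted (parallel_splits_wasted).
*/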

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   Join() is called just when we enter the usual spin-loop waiting for      *
 *   work.  We take a quick look at all active split blocks to see if any     *
 *   look "joinable".  If so, we compute an "interest" value, which will be   *
 *   defined below.  We then join the most interesting split point directly. *
 *   This split point might have been created specifically for this thread   *
 *   to join, or it might be one that was already active when this thread    *
 *   became idle, which allows us to join that existing split point and not  *
 *   request a new split operation, saving time.                             *
 *                                                                             *
 *******************************************************************************
 */
int Join(int64_t tid) {
  TREE *tree, *join_block, *child;
  int interest, best_interest, current, pass = 0;

/*
 ************************************************************
 *                                                          *
 *  First we pass over ALL split blocks, looking for those *
 *  flagged as "joinable", which means they are actually   *
 *  active split points, no processor at the split point   *
 *  has run out of work (there is no point in joining a    *
 *  split point with no remaining work), and no fail high  *
 *  has been found there which would raise the "stop"      *
 *  flag.  This is "racy" because we do not acquire any    *
 *  locks, which means that the busy threads continue      *
 *  working, and there is a small probability that the     *
 *  split point will disappear while we are in this loop.  *
 *  To resolve the potential race, after we find the most  *
 *  attractive split point, we acquire the lock for that   *
 *  split block and test again, but this time if the       *
 *  block is joinable, we can safely join under control    *
 *  of the lock, which is not held for very long at all.   *
 *  If the block is not joinable once we acquire the lock, *
 *  we abort joining since it is futile.  Note that if     *
 *  this happens, we will try to find an existing split    *
 *  point we can join three times before we exit, setting  *
 *  smp_split to 1 to ask other threads to produce more    *
 *  candidate split points.                                *
 *                                                          *
 *  Special case:  we don't want to join a split point     *
 *  that was created by this thread.  While it works, it   *
 *  can add overhead since we can encounter a later split  *
 *  point that originated at the current split point, and  *
 *  we would continue searching even though most of the    *
 *  work has already been completed.  The hash table would *
 *  help avoid most (if not all) of this overhead, but     *
 *  there is no good reason to take the chance of this     *
 *  happening.                                             *
 *                                                          *
 ************************************************************
 */
  for (pass = 0; pass < 3; pass++) {
    best_interest = -999999;
    join_block = 0;
    for (current = 0; current <= smp_max_threads * 64; current++) {
      tree = block[current];
      if (tree->joinable && (tree->ply <= tree->depth / 2 ||
              tree->nprocs < smp_split_group) && tree->thread_id != tid) {
        interest = tree->depth * 2 - tree->searched[0];
        if (interest > best_interest) {
          best_interest = interest;
          join_block = tree;
        }
      }
    }
/*
 ************************************************************
 *                                                          *
 *  Now we acquire the lock for this split block, and then *
 *  check to see if the block is still flagged as          *
 *  joinable.  If so, we set things up, and then we get    *
 *  pretty tricky as we then release the lock, and then    *
 *  copy the data from the parent to our split block.      *
 *  There is a chance that while we are copying this data, *
 *  the split point gets completed by other threads, which *
 *  would leave an apparent race condition exposed:  we    *
 *  start copying data here, the split point is completed, *
 *  the parent block is released and then reacquired, and  *
 *  we continue as if nothing has happened here, getting   *
 *  data copied from two different positions.              *
 *                                                          *
 *  Fortunately, we linked this new split block to the old *
 *  (original parent).  If that split block is released,   *
 *  we will discover this because that operation will also *
 *  set our "stop" flag, which will prevent us from using  *
 *  this data and breaking things.  We allow threads to    *
 *  copy this data without any lock protection to          *
 *  eliminate a serialization (each node would copy the    *
 *  data serially, rather than all at once), with the only *
 *  consequence being the overhead of copying and then     *
 *  throwing the data away.  That can happen on occasion   *
 *  even if we used a lock for protection, since once we   *
 *  release the lock it still takes time to get into the   *
 *  search, and we could STILL find that this split block  *
 *  has already been completed, once again.  Less          *
 *  contention and less serialization improve performance. *
 *                                                          *
 ************************************************************
 */
    if (join_block) {
      Lock(join_block->lock);
      if (join_block->joinable) {
        child = GetBlock(join_block, tid);
        Unlock(join_block->lock);
        if (child) {
          CopyFromParent(child);
          thread[tid].tree = child;
          parallel_joins++;
          return 1;
        }
      } else {
        Unlock(join_block->lock);
        break;
      }
    }
  }
/*
 ************************************************************
 *                                                          *
 *  We did not acquire a split point to join, so we set    *
 *  smp_split to 1 to ask busy threads to create joinable  *
 *  split points.                                          *
 *                                                          *
 ************************************************************
 */
  smp_split = 1;
  return 0;
}
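/*
   A stand-alone illustration (added; not part of Crafty) of the
   "interest" computation used in the selection loop above.  It mirrors
   interest = tree->depth * 2 - tree->searched[0]:  deeper split points
   with fewer moves already searched score higher, so they win the
   comparison against best_interest.
*/
static int JoinInterestSketch(int depth, int moves_searched) {
  /* e.g. depth 20 with 2 moves searched scores 38 and is preferred
     over depth 12 with 1 move searched, which only scores 23 */
  return 2 * depth - moves_searched;
}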

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadAffinity() is called to "pin" a thread to a specific processor.    *
 *   It is a "noop" (no-operation) if Crafty was not compiled with            *
 *   -DAFFINITY, or if smp_affinity is negative (smpaffinity=-1 disables      *
 *   affinity).  It simply sets the affinity for the current thread to the    *
 *   requested CPU and returns.  NOTE:  if hyperthreading is enabled, there   *
 *   is no guarantee that this will work as expected and pin one thread per   *
 *   physical core.  It depends on how the O/S numbers the SMT cores.         *
 *                                                                             *
 *******************************************************************************
 */
void ThreadAffinity(int cpu) {
#if defined(AFFINITY)
  cpu_set_t cpuset;
  pthread_t current_thread = pthread_self();

  if (smp_affinity >= 0) {
    CPU_ZERO(&cpuset);
    CPU_SET(smp_affinity_increment * (cpu + smp_affinity), &cpuset);
    pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
  }
#endif
}
402 | |||
403 | /* modified 08/03/16 */ |
||
404 | /* |
||
405 | ******************************************************************************* |
||
406 | * * |
||
108 | pmbaty | 407 | * ThreadInit() is called after a process is created. Its main task is to * |
408 | * initialize the process local memory so that it will fault in and be * |
||
409 | * allocated on the local node rather than the node where the original * |
||
410 | * (first) process was running. All threads will hang here via a custom * |
||
411 | * WaitForALlThreadsInitialized() procedure so that all the local thread * |
||
412 | * blocks are usable before the search actually begins. * |
||
413 | * * |
||
414 | ******************************************************************************* |
||
415 | */ |
||
416 | void *STDCALL ThreadInit(void *t) { |
||
417 | int tid = (int64_t) t; |
||
418 | |||
419 | ThreadAffinity(tid); |
||
154 | pmbaty | 420 | #if !defined(UNIX) |
108 | pmbaty | 421 | ThreadMalloc((uint64_t) tid); |
154 | pmbaty | 422 | #endif |
108 | pmbaty | 423 | thread[tid].blocks = 0xffffffffffffffffull; |
424 | Lock(lock_smp); |
||
425 | initialized_threads++; |
||
426 | Unlock(lock_smp); |
||
427 | WaitForAllThreadsInitialized(); |
||
428 | ThreadWait(tid, (TREE *) 0); |
||
429 | Lock(lock_smp); |
||
430 | smp_threads--; |
||
431 | Unlock(lock_smp); |
||
432 | return 0; |
||
433 | } |

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadSplit() is used to determine if we should split at the current     *
 *   ply.  There are some basic constraints on when splits can be done, such  *
 *   as the depth remaining in the search (don't split too near the tips),    *
 *   and whether we have searched at least one move to get a score or bound   *
 *   (the YBW condition).                                                     *
 *                                                                             *
 *   If those conditions are satisfied, AND either a thread has requested a   *
 *   split OR we are far enough away from the tips of the tree to justify a   *
 *   "gratuitous split", then we return "success."  A "gratuitous split" is   *
 *   a split done without any idle threads.  Since splits are not free, we    *
 *   only do this well away from the tips to limit overhead.  We do this so   *
 *   that when a thread becomes idle, it will find these split points         *
 *   immediately and not have to wait for a split after the fact.             *
 *                                                                             *
 *******************************************************************************
 */
int ThreadSplit(TREE * tree, int ply, int depth, int alpha, int o_alpha,
    int done) {
  TREE *used;
  int64_t tblocks;
  int temp, unused = 0;

/*
 ************************************************************
 *                                                          *
 *  First, we see if we meet the basic criteria to create  *
 *  a split point, that being that we must not be too far  *
 *  from the root (smp_min_split_depth).                   *
 *                                                          *
 ************************************************************
 */
  if (depth < smp_min_split_depth)
    return 0;
/*
 ************************************************************
 *                                                          *
 *  If smp_split is NOT set, we are checking to see if it  *
 *  is acceptable to do a gratuitous split here.           *
 *                                                          *
 *  (1) If we are too far from the root, we do not do      *
 *      gratuitous splits, to avoid the overhead.          *
 *                                                          *
 *  (2) If we have searched more than one move at this     *
 *      ply, we don't do any further tests to see if a     *
 *      gratuitous split is acceptable, since we have      *
 *      previously done this test at this ply and decided  *
 *      one should not be done.  That condition has likely *
 *      not changed.                                       *
 *                                                          *
 *  (3) If we have pre-existing gratuitous split points    *
 *      for this thread, we make certain we don't create   *
 *      more than the gratuitous split limit, as excessive *
 *      splits just add to the overhead with no benefit.   *
 *                                                          *
 ************************************************************
 */
  if (!smp_split) {
    if (depth < smp_gratuitous_depth || done > 1)
      return 0;
    tblocks = ~thread[tree->thread_id].blocks;
    while (tblocks) {
      temp = LSB(tblocks);
      used = block[temp + tree->thread_id * 64 + 1];
      if (used->joinable && !used->joined)
        unused++;
      Clear(temp, tblocks);
    }
    if (unused > smp_gratuitous_limit)
      return 0;
  }
/*
 ************************************************************
 *                                                          *
 *  If smp_split IS set, we are checking to see if it is   *
 *  acceptable to do a split because there are idle        *
 *  threads that need work to do.                          *
 *                                                          *
 *  The only reason this would be false is if we have a    *
 *  pre-existing split point that is joinable but has not  *
 *  been joined.  If one exists, there is no need to split *
 *  again, as there is already an accessible split point.  *
 *  Otherwise, if we are at the root and we are either not *
 *  allowed to split at the root, or we have additional    *
 *  root moves that have to be searched one at a time      *
 *  using all available threads, we also cannot split      *
 *  here.                                                  *
 *                                                          *
 ************************************************************
 */
  else {
    if (ply == 1 && (!smp_split_at_root || !NextRootMoveParallel() ||
            alpha == o_alpha))
      return 0;
    tblocks = ~thread[tree->thread_id].blocks;
    while (tblocks) {
      temp = LSB(tblocks);
      used = block[temp + tree->thread_id * 64 + 1];
      if (used->joinable && !used->joined)
        unused++;
      Clear(temp, tblocks);
    }
    if (unused > smp_gratuitous_limit)
      return 0;
  }
  return 1;
}

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadStop() is called from SearchMoveList() when it detects a beta      *
 *   cutoff (fail high) at a node that is being searched in parallel.  We     *
 *   need to stop all threads here, and since this threading algorithm is     *
 *   recursive, it may be necessary to stop other threads that are helping    *
 *   search this branch further down into the tree.  This function simply     *
 *   sets the appropriate tree->stop variables to 1, which will stop those    *
 *   particular threads instantly and return them to the idle loop in         *
 *   ThreadWait().                                                            *
 *                                                                             *
 *******************************************************************************
 */
void ThreadStop(TREE * RESTRICT tree) {
  int proc;

  Lock(tree->lock);
  tree->stop = 1;
  tree->joinable = 0;
  for (proc = 0; proc < smp_max_threads; proc++)
    if (tree->siblings[proc])
      ThreadStop(tree->siblings[proc]);
  Unlock(tree->lock);
}

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadTrace() is a debugging tool that simply walks the split block      *
 *   tree and displays interesting data to help debug the parallel search     *
 *   whenever changes break things.                                           *
 *                                                                             *
 *******************************************************************************
 */
void ThreadTrace(TREE * RESTRICT tree, int depth, int brief) {
  int proc, i;

  Lock(tree->lock);
  Lock(lock_io);
  if (!brief) {
    for (i = 0; i < 4 * depth; i++)
      Print(4095, " ");
    depth++;
    Print(4095, "block[%d] thread=%d ply=%d nprocs=%d ", FindBlockID(tree),
        tree->thread_id, tree->ply, tree->nprocs);
    Print(4095, "joined=%d joinable=%d stop=%d nodes=%d", tree->joined,
        tree->joinable, tree->stop, tree->nodes_searched);
    Print(4095, " parent=%d\n", FindBlockID(tree->parent));
  } else {
    if (tree->nprocs > 1) {
      for (i = 0; i < 4 * depth; i++)
        Print(4095, " ");
      depth++;
      Print(4095, "(ply %d)", tree->ply);
    }
  }
  if (tree->nprocs) {
    if (!brief) {
      for (i = 0; i < 4 * depth; i++)
        Print(4095, " ");
      Print(4095, " parent=%d sibling threads=", FindBlockID(tree->parent));
      for (proc = 0; proc < smp_max_threads; proc++)
        if (tree->siblings[proc])
          Print(4095, " %d(%d)", proc, FindBlockID(tree->siblings[proc]));
      Print(4095, "\n");
    } else {
      if (tree->nprocs > 1) {
        Print(4095, " helping= ");
        for (proc = 0; proc < smp_max_threads; proc++)
          if (tree->siblings[proc]) {
            if (proc == tree->thread_id)
              Print(4095, "[");
            Print(4095, "%d", proc);
            if (proc == tree->thread_id)
              Print(4095, "]");
            Print(4095, " ");
          }
        Print(4095, "\n");
      }
    }
  }
  Unlock(lock_io);
  for (proc = 0; proc < smp_max_threads; proc++)
    if (tree->siblings[proc])
      ThreadTrace(tree->siblings[proc], depth, brief);
  Unlock(tree->lock);
}

/* modified 08/03/16 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadWait() is the idle loop for the N threads that are created at the  *
 *   beginning when Crafty searches.  Threads are "parked" here waiting on a  *
 *   pointer to something they should search (a parameter block built in the  *
 *   function Split() in this case).  When this pointer becomes non-zero,     *
 *   each thread "parked" here will immediately call SearchMoveList() and     *
 *   begin the parallel search as directed.                                   *
 *                                                                             *
 *   Upon entry, all threads except for the "master" will arrive here with a  *
 *   value of zero (0) in the waiting parameter below.  This indicates that   *
 *   they will search and then be done.  The "master" will arrive here with   *
 *   a pointer to the parent split block in "waiting", which says I will sit  *
 *   here waiting for work OR until the waiting split block has no threads    *
 *   working on it, at which point I should return, which will let me         *
 *   "unsplit" here and clean things up.  The call to here in Split() passes  *
 *   this block address, while threads that are helping get here with a zero. *
 *                                                                             *
 *******************************************************************************
 */
int ThreadWait(int tid, TREE * RESTRICT waiting) {
  int value, tstart, tend;

/*
 ************************************************************
 *                                                          *
 *  When we reach this point, one of three possible        *
 *  conditions is true:  (1) we already have work to do,   *
 *  as we are the "master thread" and we have already      *
 *  split the tree, and we are coming here to join in;     *
 *  (2) we are the master, and we are waiting on our split *
 *  point to complete, so we come here to join and help    *
 *  currently active threads; (3) we have no work to do,   *
 *  so we will spin until Join() locates a split point we  *
 *  can join to help out.                                  *
 *                                                          *
 *  Note that when we get here, the parent already has a   *
 *  split block and does not need to call Join(); it       *
 *  simply falls through the while spin loop below because *
 *  its "tree" pointer is already non-zero.                *
 *                                                          *
 ************************************************************
 */
  while (FOREVER) {
    tstart = ReadClock();
    while (!thread[tid].tree && (!waiting || waiting->nprocs) && !Join(tid) &&
        !thread[tid].terminate);
    tend = ReadClock();
    if (!thread[tid].tree)
      thread[tid].tree = waiting;
    thread[tid].idle += tend - tstart;
    if (thread[tid].tree == waiting || thread[tid].terminate)
      return 0;
/*
 ************************************************************
 *                                                          *
 *  Once we get here, we have a good split point, so we    *
 *  are ready to participate in a parallel search.  Once   *
 *  we return from SearchMoveList() we copy our results    *
 *  back to the parent via CopyToParent() before we look   *
 *  for a new split point.  If we are a parent, we will    *
 *  slip out of the spin loop at the top and return to the *
 *  normal serial search to finish up here.                *
 *                                                          *
 *  When we return from SearchMoveList(), we need to       *
 *  decrement the "nprocs" value since there is now one    *
 *  less thread working at this split point.               *
 *                                                          *
 *  Note:  CopyToParent() marks the current split block as *
 *  unused once the copy is completed, so we don't have to *
 *  do anything about that here.                           *
 *                                                          *
 ************************************************************
 */
    value =
        SearchMoveList(thread[tid].tree, thread[tid].tree->ply,
        thread[tid].tree->depth, thread[tid].tree->wtm,
        thread[tid].tree->alpha, thread[tid].tree->beta,
        thread[tid].tree->searched, thread[tid].tree->in_check, 0, parallel);
    tstart = ReadClock();
    Lock(thread[tid].tree->parent->lock);
    thread[tid].tree->parent->joinable = 0;
    CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
    thread[tid].tree->parent->nprocs--;
    thread[tid].tree->parent->siblings[tid] = 0;
    Unlock(thread[tid].tree->parent->lock);
    thread[tid].tree = 0;
    tend = ReadClock();
    thread[tid].idle += tend - tstart;
  }
}
728 | |||
154 | pmbaty | 729 | /* modified 08/03/16 */ |
33 | pmbaty | 730 | /* |
108 | pmbaty | 731 | ******************************************************************************* |
732 | * * |
||
733 | * CopyFromParent() is used to copy data from a parent thread to a child * |
||
734 | * thread. This only copies the appropriate parts of the TREE structure to * |
||
735 | * avoid burning memory bandwidth by copying everything. * |
||
736 | * * |
||
737 | ******************************************************************************* |
||
738 | */ |
||
739 | void CopyFromParent(TREE * RESTRICT child) { |
||
740 | TREE *parent = child->parent; |
||
741 | int i, ply; |
||
742 | |||
743 | /* |
||
33 | pmbaty | 744 | ************************************************************ |
745 | * * |
||
746 | * We have allocated a split block. Now we copy the tree * |
||
747 | * search state from the parent block to the child in * |
||
748 | * preparation for starting the parallel search. * |
||
749 | * * |
||
750 | ************************************************************ |
||
751 | */ |
||
108 | pmbaty | 752 | ply = parent->ply; |
33 | pmbaty | 753 | child->ply = ply; |
754 | child->position = parent->position; |
||
108 | pmbaty | 755 | for (i = 0; i <= rep_index + parent->ply; i++) |
33 | pmbaty | 756 | child->rep_list[i] = parent->rep_list[i]; |
108 | pmbaty | 757 | for (i = ply - 1; i < MAXPLY; i++) |
758 | child->killers[i] = parent->killers[i]; |
||
154 | pmbaty | 759 | for (i = 0; i < 4096; i++) { |
760 | child->counter_move[i] = parent->counter_move[i]; |
||
761 | child->move_pair[i] = parent->move_pair[i]; |
||
762 | } |
||
108 | pmbaty | 763 | for (i = ply - 1; i <= ply; i++) { |
33 | pmbaty | 764 | child->curmv[i] = parent->curmv[i]; |
108 | pmbaty | 765 | child->pv[i] = parent->pv[i]; |
33 | pmbaty | 766 | } |
108 | pmbaty | 767 | child->in_check = parent->in_check; |
768 | child->last[ply] = child->move_list; |
||
769 | child->status[ply] = parent->status[ply]; |
||
770 | child->status[1] = parent->status[1]; |
||
771 | child->save_hash_key[ply] = parent->save_hash_key[ply]; |
||
772 | child->save_pawn_hash_key[ply] = parent->save_pawn_hash_key[ply]; |
||
33 | pmbaty | 773 | child->nodes_searched = 0; |
774 | child->fail_highs = 0; |
||
775 | child->fail_high_first_move = 0; |
||
776 | child->evaluations = 0; |
||
777 | child->egtb_probes = 0; |
||
108 | pmbaty | 778 | child->egtb_hits = 0; |
33 | pmbaty | 779 | child->extensions_done = 0; |
780 | child->qchecks_done = 0; |
||
781 | child->moves_fpruned = 0; |
||
108 | pmbaty | 782 | child->moves_mpruned = 0; |
783 | for (i = 0; i < 16; i++) { |
||
784 | child->LMR_done[i] = 0; |
||
785 | child->null_done[i] = 0; |
||
786 | } |
||
33 | pmbaty | 787 | child->alpha = parent->alpha; |
788 | child->beta = parent->beta; |
||
789 | child->value = parent->value; |
||
108 | pmbaty | 790 | child->wtm = parent->wtm; |
33 | pmbaty | 791 | child->depth = parent->depth; |
108 | pmbaty | 792 | child->searched = parent->searched; |
793 | strcpy(child->root_move_text, parent->root_move_text); |
||
794 | strcpy(child->remaining_moves_text, parent->remaining_moves_text); |
||
33 | pmbaty | 795 | } |
796 | |||
154 | pmbaty | 797 | /* modified 08/03/16 */ |
33 | pmbaty | 798 | /* |
799 | ******************************************************************************* |
||
800 | * * |
||
801 | * CopyToParent() is used to copy data from a child thread to a parent * |
||
802 | * thread. This only copies the appropriate parts of the TREE structure to * |
||
803 | * avoid burning memory bandwidth by copying everything. * |
||
804 | * * |
||
805 | ******************************************************************************* |
||
806 | */ |
||
807 | void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) { |
||
108 | pmbaty | 808 | int i, ply = parent->ply, which; |
33 | pmbaty | 809 | |
810 | /* |
||
811 | ************************************************************ |
||
812 | * * |
||
108 | pmbaty | 813 | * The only concern here is to make sure that the info is * |
814 | * only copied to the parent if our score is > than the * |
||
815 | * parent value, and that we were not stopped for any * |
||
816 | * reason which could produce a partial score that is * |
||
817 | * worthless and dangerous to use. * |
||
33 | pmbaty | 818 | * * |
108 | pmbaty | 819 | * One important special case. If we get here with the * |
820 | * thread->stop flag set, and ply is 1, then we need to * |
||
821 | * clear the "this move has been searched" flag in the ply * |
||
822 | * 1 move list since we did not complete the search. If * |
||
823 | * we fail to do this, then a move being searched in * |
||
824 | * parallel at the root will be "lost" for this iteration * |
||
825 | * and won't be searched again until the next iteration. * |
||
826 | * * |
||
33 | pmbaty | 827 | * In any case, we add our statistical counters to the * |
828 | * parent's totals no matter whether we finished or not * |
||
829 | * since the total nodes searched and such should consider * |
||
830 | * everything searched, not just the "useful stuff." * |
||
831 | * * |
||
108 | pmbaty | 832 | * After we finish copying everything, we mark this split * |
833 | * block as free in the split block bitmap. * |
||
834 | * * |
||
33 | pmbaty | 835 | ************************************************************ |
836 | */ |
||
837 | if (child->nodes_searched && !child->stop && value > parent->value && |
||
838 | !abort_search) { |
||
839 | parent->pv[ply] = child->pv[ply]; |
||
840 | parent->value = value; |
||
841 | parent->cutmove = child->curmv[ply]; |
||
154 | pmbaty | 842 | for (i = 0; i < 4096; i++) { |
843 | parent->counter_move[i] = child->counter_move[i]; |
||
844 | parent->move_pair[i] = child->move_pair[i]; |
||
845 | } |
||
33 | pmbaty | 846 | } |
108 | pmbaty | 847 | if (child->stop && ply == 1) |
848 | for (which = 0; which < n_root_moves; which++) |
||
849 | if (root_moves[which].move == child->curmv[ply]) { |
||
850 | root_moves[which].status &= 7; |
||
851 | break; |
||
852 | } |
||
33 | pmbaty | 853 | parent->nodes_searched += child->nodes_searched; |
854 | parent->fail_highs += child->fail_highs; |
||
855 | parent->fail_high_first_move += child->fail_high_first_move; |
||
856 | parent->evaluations += child->evaluations; |
||
857 | parent->egtb_probes += child->egtb_probes; |
||
108 | pmbaty | 858 | parent->egtb_hits += child->egtb_hits; |
33 | pmbaty | 859 | parent->extensions_done += child->extensions_done; |
860 | parent->qchecks_done += child->qchecks_done; |
||
861 | parent->moves_fpruned += child->moves_fpruned; |
||
108 | pmbaty | 862 | parent->moves_mpruned += child->moves_mpruned; |
863 | for (i = 1; i < 16; i++) { |
||
864 | parent->LMR_done[i] += child->LMR_done[i]; |
||
865 | parent->null_done[i] += child->null_done[i]; |
||
866 | } |
||
867 | which = FindBlockID(child) - 64 * child->thread_id - 1; |
||
868 | Set(which, thread[child->thread_id].blocks); |
||
33 | pmbaty | 869 | } |
870 | |||
154 | pmbaty | 871 | /* modified 08/03/16 */ |
33 | pmbaty | 872 | /* |
873 | ******************************************************************************* |
||
874 | * * |
||
108 | pmbaty | 875 | * GetBlock() is used to allocate a split block and fill in only SMP- * |
876 | * critical information. The child process will copy the rest of the split * |
||
877 | * block information as needed. * |
||
33 | pmbaty | 878 | * * |
108 | pmbaty | 879 | * When we arrive here, the parent split block must be locked since we are * |
880 | * going to change data in that block as well as copy data from that block * |
||
881 | * the current split block. The only exception is when this is the original * |
||
882 | * split operation, since this is done "out of sight" of other threads which * |
||
883 | * means no locks are needed until after the "joinable" flag is set, which * |
||
884 | * exposes this split point to other threads instantly. * |
||
885 | * * |
||
33 | pmbaty | 886 | ******************************************************************************* |
887 | */ |
||
154 | pmbaty | 888 | TREE *GetBlock(TREE * RESTRICT parent, int tid) { |
108 | pmbaty | 889 | TREE *child; |
890 | static int warnings = 0; |
||
891 | int i, unused; |
||
892 | /* |
||
893 | ************************************************************ |
||
894 | * * |
||
895 | * One NUMA-related trick is that we only allocate a split * |
||
896 | * block in the thread's local memory. Each thread has a * |
||
897 | * group of split blocks that were first touched by the * |
||
898 | * correct CPU so that the split blocks page-faulted into * |
||
899 | * local memory for that specific processor. If we can't * |
||
900 | * find an optimally-placed block, we return a zero which * |
||
901 | * will prevent this thread from joining the split point. * |
||
902 | * This is highly unlikely as it would mean the current * |
||
903 | * thread has 64 active split blocks where it is waiting * |
||
904 | * on other threads to complete the last bit of work at * |
||
905 | * each. This is extremely unlikely. * |
||
906 | * * |
||
907 | * Here we use a simple 64 bit bit-map per thread that * |
||
908 | * indicates which blocks are free (1) and which blocks * |
||
909 | * used (0). We simply use LSB() to find the rightmost * |
||
910 | * one-bit set and that is the local block number. We * |
||
911 | * convert that to a global block number by doing the * |
||
912 | * simple computation: * |
||
913 | * * |
||
914 | * global = local + 64 * tid + 1 * |
||
915 | * * |
||
916 | * Each thread has exactly 64 split blocks, and block 0 * |
||
917 | * is the "master block" that never gets allocated or * |
||
918 | * freed. Once we find a free block for the current * |
||
919 | * thread, we zero that bit so that the block won't be * |
||
920 | * used again until it is released. * |
||
921 | * * |
||
922 | ************************************************************ |
||
923 | */ |
||
924 | if (thread[tid].blocks) { |
||
925 | unused = LSB(thread[tid].blocks); |
||
926 | Clear(unused, thread[tid].blocks); |
||
927 | Set(unused, thread[tid].max_blocks); |
||
928 | } else { |
||
929 | if (++warnings < 6) |
||
930 | Print(2048, |
||
931 | "WARNING. local SMP block cannot be allocated, thread %d\n", tid); |
||
932 | return 0; |
||
933 | } |
||
934 | child = block[unused + tid * 64 + 1]; |
||
935 | /* |
||
936 | ************************************************************ |
||
937 | * * |
||
938 | * Found a split block. Now we need to fill in only the * |
||
939 | * critical information that can't be delayed due to race * |
||
940 | * conditions. When we get here, the parent split block * |
||
941 | * must be locked, that lets us safely update the number * |
||
942 | * of processors working here, etc, without any ugly race * |
||
943 | * conditions that would corrupt this critical data. * |
||
944 | * * |
||
945 | ************************************************************ |
||
946 | */ |
||
154 | pmbaty | 947 | for (i = 0; i < smp_max_threads; i++) |
108 | pmbaty | 948 | child->siblings[i] = 0; |
949 | child->nprocs = 0; |
||
950 | child->stop = 0; |
||
951 | child->joinable = 0; |
||
952 | child->joined = 0; |
||
953 | child->parent = parent; |
||
954 | child->thread_id = tid; |
||
955 | parent->nprocs++; |
||
956 | parent->siblings[tid] = child; |
||
957 | parent->joined = 1; |
||
958 | return child; |
||
33 | pmbaty | 959 | } |
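/*
   Stand-alone sketch (added; not Crafty code) of the per-thread
   free-block bitmap GetBlock() manipulates above:  bit i set in
   thread[tid].blocks means local block i is free.  LSB() and Clear()
   are the same macros used above, and the return value follows the
   global = local + 64 * tid + 1 mapping from the comment.
*/
static int AllocLocalBlockSketch(int tid) {
  int local;

  if (!thread[tid].blocks)
    return 0;                       /* all 64 local blocks are in use */
  local = LSB(thread[tid].blocks);  /* rightmost 1 bit = free block */
  Clear(local, thread[tid].blocks); /* mark it used */
  return local + tid * 64 + 1;      /* global index into block[] */
}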
960 | |||
961 | /* |
||
962 | ******************************************************************************* |
||
963 | * * |
||
108 | pmbaty | 964 | * WaitForAllThreadsInitialized() waits until all smp_max_threads are * |
965 | * initialized. We have to initialize each thread and malloc() its split * |
||
966 | * blocks before we start the actual parallel search. Otherwise we will see * |
||
967 | * invalid memory accesses and crash instantly. * |
||
33 | pmbaty | 968 | * * |
969 | ******************************************************************************* |
||
970 | */ |
||
108 | pmbaty | 971 | void WaitForAllThreadsInitialized(void) { |
154 | pmbaty | 972 | while (initialized_threads < smp_max_threads); /* Do nothing */ |
33 | pmbaty | 973 | } |
974 | |||
154 | pmbaty | 975 | #if !defined (UNIX) |
976 | /* modified 08/03/16 */ |
||
33 | pmbaty | 977 | /* |
978 | ******************************************************************************* |
||
979 | * * |
||
980 | * ThreadMalloc() is called from the ThreadInit() function. It malloc's the * |
||
981 | * split blocks in the local memory for the processor associated with the * |
||
982 | * specific thread that is calling this code. * |
||
983 | * * |
||
984 | ******************************************************************************* |
||
985 | */ |
||
986 | extern void *WinMalloc(size_t, int); |
||
987 | void ThreadMalloc(int64_t tid) { |
||
108 | pmbaty | 988 | int i; |
33 | pmbaty | 989 | |
154 | pmbaty | 990 | for (i = tid * 64 + 1; i < tid * 64 + 65; i++) { |
33 | pmbaty | 991 | if (block[i] == NULL) |
992 | block[i] = |
||
993 | (TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) + |
||
154 | pmbaty | 994 | 127, tid))); |
33 | pmbaty | 995 | block[i]->parent = NULL; |
996 | LockInit(block[i]->lock); |
||
997 | } |
||
998 | } |
||
999 | #endif |
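/*
   Sketch (added; illustration only) of the 128-byte alignment
   arithmetic used in ThreadMalloc() above:  the block is over-allocated
   by 127 bytes, then the address is rounded up to the next multiple of
   128 by adding 127 and masking off the low 7 bits.
*/
static void *Align128Sketch(void *raw) {
  return (void *) ((~(size_t) 127) & (127 + (size_t) raw));
}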