#include "chess.h"
#include "data.h"
#include "epdglue.h"
/* modified 04/30/14 */
/*
 *******************************************************************************
 *                                                                             *
 *   Thread() is the driver for the threaded parallel search in Crafty.  The  *
 *   basic idea is that whenever we notice that one (or more) threads are in  *
 *   their idle loop, we drop into Thread(), from Search(), and begin a new   *
 *   parallel search from this node.  This is simply a problem of copying the *
 *   search state space for each thread working at this node, then sending    *
 *   everyone off to SearchParallel() to search this node in parallel.        *
 *                                                                             *
 *   There are a number of settable options via the command-line or .craftyrc *
 *   initialization file.  Here's a concise explanation for each option and an*
 *   occasional suggestion for testing/tuning.                                *
 *                                                                             *
 *   smp_max_threads (command = smpmt=n) sets the total number of allowable   *
 *      threads for the search.  The default is one (1) as Crafty does not    *
 *      assume it should use all available resources.  For optimal performance*
 *      this should be set to the number of physical cores your machine has,  *
 *      which does NOT include hyperthreaded cores.                           *
 *                                                                             *
 *   smp_split_group (command = smpgroup=n) sets the maximum number of threads*
 *      at any single split point, with the exception of split points where   *
 *      ply <= 4 where ALL threads are allowed to split together ignoring this*
 *      limit.                                                                *
 *                                                                             *
 *   smp_min_split_depth (command = smpmin=n) avoids splitting when remaining *
 *      depth < n.  This is used to balance splitting overhead cost against   *
 *      the speed gain the parallel search produces.  The default is currently*
 *      5 (which could change with future generations of Intel hardware) but  *
 *      values between 4 and 8 will work.  Larger values allow somewhat fewer *
 *      splits, which reduces overhead, but it also increases the percentage  *
 *      of the time where a thread is waiting on work.                        *
 *                                                                             *
 *   smp_split_nodes (command = smpnodes=n) will not allow a thread to split  *
 *      until it has searched at least N nodes, to prevent excessive splitting*
 *      out near the tips in endgames.  The default is 2000.  Larger values   *
 *      will reduce splitting overhead but will increase the amount of time   *
 *      a thread is waiting on work.                                          *
 *                                                                             *
 *   smp_split_at_root (command=smproot=0 or 1) enables (1) or disables (0)   *
 *      splitting the tree at the root.  This defaults to 1 which produces the*
 *      best performance by a significant margin.  But it can be disabled if  *
 *      you are playing with code changes.                                    *
 *                                                                             *
 *   The best way to tune any of these parameters is to run SEVERAL test cases*
 *   (positions) with max threads set.  Run each set of positions several     *
 *   times with each parameter change you want to try (do them ONE at a time  *
 *   for obvious reasons).  Pick the value with the smallest overall search   *
 *   time.  The more cores you use, the more times you should run each test,  *
 *   since parallel search is highly non-deterministic and you need several   *
 *   runs to get a reasonable average.                                        *
 *                                                                             *
 *   A few basic "rules of the road" for anyone interested in changing or     *
 *   adding to any of this code.                                              *
 *                                                                             *
 *   1. If, at any time, you want to modify your private split block, no lock *
 *      is required.                                                          *
 *                                                                             *
 *   2. If, at any time, you want to modify another split block, such as the  *
 *      parent split block's shared move list, you must acquire the lock in   *
 *      that split block first (i.e., tree->parent->lock, to lock the parent  *
 *      split block during NextMove() and such).  Note that this ONLY applies *
 *      to search-related data, NOT the SMP data within the split block       *
 *      (parent pointer, sibling pointers, number of processors working here, *
 *      etc).  Modifying those falls under the next lock below.               *
 *                                                                             *
 *   3. If you want to modify any SMP-related data, such as allocating a      *
 *      split block, or telling sibling processes to stop, etc, then you must *
 *      acquire the global "lock_smp" lock first.  This prevents any sort of  *
 *      race condition that could corrupt the split block tree organization.  *
 *      This applies to ANY smp-related variable, from the simple smp_idle    *
 *      variable through the various sibling chains and such.                 *
 *                                                                             *
 *   4. If you want to do any sort of I/O operation, you must acquire the     *
 *      "lock_io" lock first.  Since threads share descriptors, there are     *
 *      lots of potential race conditions, from the simple tty-output getting *
 *      interlaced from different threads, to outright data corruption in the *
 *      book or log files.                                                    *
 *                                                                             *
 *   5. There is a specific "lock_split" that is used to prevent a lot of     *
 *      queued waiting to do splits, but it is unlikely that lock would ever  *
 *      be used anywhere other than where it is currently accessed in         *
 *      Search() at the parallel split initiation code.                       *
 *                                                                             *
 *   6. If you want to alter the root move list, you must first acquire       *
 *      lock_root, since the root move list is shared and multiple threads    *
 *      can attempt to modify it at the same time.  Overlooking this can      *
 *      result in a corrupted root move list with one or more moves missing   *
 *      or duplicated.                                                        *
 *                                                                             *
 *   Some of the bugs caused by failing to acquire the correct lock will only *
 *   occur infrequently, and they are extremely difficult to find.  Some only *
 *   show up in a public game where everyone is watching, causing maximum     *
 *   embarrassment as the program does something extremely stupid.            *
 *                                                                             *
 *******************************************************************************
 */
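/*
 * Editor's illustration (not part of the search): the locking rules above in
 * miniature.  The move-list access is a hypothetical stand-in for the real
 * work done in NextMove() and friends.
 *
 *     Lock(tree->parent->lock);   // rule 2: parent's shared search data
 *     ... consume a move from the parent's shared move list ...
 *     Unlock(tree->parent->lock);
 *
 *     Lock(lock_smp);             // rule 3: global SMP state
 *     smp_idle--;                 // any smp_* variable needs this lock
 *     Unlock(lock_smp);
 *
 *     Lock(lock_io);              // rule 4: threads share descriptors
 *     Print(128, "thread %d splitting\n", tree->thread_id);
 *     Unlock(lock_io);
 */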
int Thread(TREE * RESTRICT tree) {
  int proc, tsplit = 0, nblocks = 0;

/*
 ************************************************************
 *                                                          *
 *  First, we make sure that there are idle threads.  It    *
 *  is possible that we get here after they have all been   *
 *  claimed by another thread.  In that case we return.     *
 *                                                          *
 ************************************************************
 */
  Lock(lock_smp);
  for (proc = 0; proc < smp_max_threads && !thread[proc].idle; proc++);
  if (proc == smp_max_threads) {
    smp_split = 0;
    Unlock(lock_smp);
    return 0;
  }
/*
 ************************************************************
 *                                                          *
 *  Now we start copying the current "state" from this      *
 *  thread into new thread blocks so that the threads can   *
 *  search this position without interfering with each      *
 *  other.  As we copy, we link those blocks as siblings of *
 *  the parent node, and point each sibling back to the     *
 *  parent so we can unwind this confusion as the threads   *
 *  complete their work.                                    *
 *                                                          *
 *  CopyToChild() allocates a split block optimally (see    *
 *  that code for an explanation) and then copies anything  *
 *  necessary for the parallel search.                      *
 *                                                          *
 *  Special case 1:  In the loop to allocate a split block  *
 *  and copy current data to the new child thread, we skip  *
 *  over the current splitting thread's data.  After the    *
 *  loop we then explicitly allocate a block for this       *
 *  thread so that it will be included in the thread group  *
 *  (remember that this thread is the one that has to back  *
 *  up through the split block, so it will always call      *
 *  ThreadWait() telling it to do so).  There was a         *
 *  peculiar condition that caused an inefficiency.  If we  *
 *  are at a depth where we honor the smp_split_group       *
 *  limit, it is possible that on occasion, there could be  *
 *  so many idle processors that we would not include       *
 *  ourselves.  For example, suppose we are thread 6, and   *
 *  when we enter Thread() to do the split, threads 0-5     *
 *  are idle, and smp_split_group = 6.  We would pick up    *
 *  the first 6 idle threads and sic them on this split     *
 *  point, but we would fail to include ourselves.  We      *
 *  would hit ThreadWait() with no work to do, and then     *
 *  split somewhere else.  Unfortunately, THIS thread is    *
 *  the one that needs to wait for all the other threads    *
 *  to complete so that it can back up to the previous      *
 *  ply.  But if it is busy on another split point, it      *
 *  won't be able to return to this point until it is       *
 *  idle, which means progress at this split point is held  *
 *  up.  We now forcefully include the current thread, and  *
 *  only include smp_split_group - 1 other threads to work  *
 *  here with this thread.                                  *
 *                                                          *
 *  Special case 2:  It is possible that there are no split *
 *  blocks available.  In that case, CopyToChild() will     *
 *  terminate Crafty with a log-file error message telling  *
 *  the user to re-compile with a larger number of split    *
 *  blocks (the probability of this actually happening is   *
 *  incredibly small, but it is non-zero; we use a VERY     *
 *  liberal estimate for the number of split blocks to      *
 *  allocate to avoid this).                                *
 *                                                          *
 ************************************************************
 */
  thread[tree->thread_id].tree = 0;
  tree->nprocs = 0;
  for (proc = 0; proc < smp_max_threads; proc++) {
    tree->siblings[proc] = 0;
    if (thread[proc].idle) {
      CopyToChild(tree, proc);
      nblocks++;
      if (nblocks >= smp_split_group && tree->ply > tree->depth / 2)
        break;
    }
  }
  CopyToChild(tree, tree->thread_id);
  nblocks++;
/*
 ************************************************************
 *                                                          *
 *  Everything is set.  Now we can stuff the address of     *
 *  the thread blocks into thread[i].tree so that those     *
 *  idle threads can begin the parallel search.  We also    *
 *  flag them as "not idle" so that a split immediately     *
 *  after we exit won't try to pick them up again,          *
 *  breaking things.                                        *
 *                                                          *
 ************************************************************
 */
  parallel_splits++;
  for (proc = 0; proc < smp_max_threads; proc++)
    if (tree->siblings[proc]) {
      thread[proc].tree = tree->siblings[proc];
      thread[proc].idle = 0;
    }
/*
 ************************************************************
 *                                                          *
 *  One final test before leaving.  We test all threads     *
 *  to determine if any are still idle (most likely due     *
 *  to the smp_split_group limit, but a thread could go     *
 *  idle after our loop to pick up idle threads).  If so,   *
 *  we set split back to one (actually a temporary split    *
 *  variable, since we don't want to remove the split=-1    *
 *  state until just before we exit, to avoid any races).   *
 *                                                          *
 ************************************************************
 */
  for (proc = 0; proc < smp_max_threads; proc++)
    if (thread[proc].idle)
      tsplit = 1;
/*
 ************************************************************
 *                                                          *
 *  After the threads are kicked off to start the parallel  *
 *  search using the idle threads, this thread has to be    *
 *  inserted as well.  However, since it is possible that   *
 *  this thread may finish before any or all of the other   *
 *  parallel threads, this thread is sent to ThreadWait()   *
 *  which will immediately send it to SearchParallel() like *
 *  the other threads.  Going to ThreadWait() allows this   *
 *  thread to join others if it runs out of work to do.  We *
 *  do pass ThreadWait() the address of the parent thread   *
 *  block, so that if this thread becomes idle, and this    *
 *  thread block shows no threads are still busy, then this *
 *  thread can return to here and then back up into the     *
 *  previous ply as it should.  Note that no other thread   *
 *  can back up to the previous ply since their recursive   *
 *  call stacks are not set for that.                       *
 *                                                          *
 ************************************************************
 */
  smp_split = tsplit;
  Unlock(lock_smp);
  ThreadWait(tree->thread_id, tree);
  return 1;
}
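
/*
 * For orientation, the caller side in Search() behaves roughly like the
 * sketch below (a simplification, not the exact Crafty code).  A busy thread
 * only pays for a call to Thread() when helpers may be available and the
 * options described above say the split is worth its overhead:
 *
 *     if (smp_split && depth >= smp_min_split_depth &&
 *         tree->nodes_searched > smp_split_nodes)
 *       if (Thread(tree)) {
 *         ... parallel search done; pick up the backed-up value ...
 *       }
 */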

/* modified 04/30/14 */
/*
 *******************************************************************************
 *                                                                             *
 *   CopyToChild() is used to copy data from a parent thread to a particular  *
 *   child thread.  This only copies the appropriate parts of the TREE        *
 *   structure to avoid burning memory bandwidth by copying everything.       *
 *                                                                             *
 *******************************************************************************
 */
void CopyToChild(TREE * RESTRICT parent, int thread) {
  int i, j, /*max, */ply = parent->ply;
  unsigned int max_; // Pierre-Marie Baty -- should be unsigned
  TREE *child;
  static int warnings = 0;
  int first = thread * MAX_BLOCKS_PER_CPU + 1;
  int last = first + MAX_BLOCKS_PER_CPU;
  int maxb = smp_max_threads * MAX_BLOCKS_PER_CPU + 1;

/*
 ************************************************************
 *                                                          *
 *  One NUMA-related trick is that we first try to allocate *
 *  a split block in the thread's local memory.  Each       *
 *  thread has a group of split blocks that were first      *
 *  touched by the correct CPU so that the split blocks     *
 *  page faulted into local memory for that specific        *
 *  processor.  If we can't find an optimally placed block, *
 *  the second pass will find the first available block.    *
 *  If none can be found, we print an error and terminate,  *
 *  since the search cannot proceed without a split block.  *
 *                                                          *
 ************************************************************
 */
  for (i = first; i < last && block[i]->used; i++);
  if (i >= last) {
    if (++warnings < 6)
      Print(128,
          "WARNING. optimal SMP block cannot be allocated, thread %d\n",
          thread);
    for (i = 1; i < maxb && block[i]->used; i++);
    if (i >= maxb) {
      Print(128, "ERROR. no SMP block can be allocated\n");
      exit(1);
    }
  }
  max_ = 0;
  for (j = 1; j < maxb; j++)
    if (block[j]->used)
      max_++;
  max_split_blocks = Max(max_split_blocks, max_);
/*
 ************************************************************
 *                                                          *
 *  We have allocated a split block.  Now we copy the tree  *
 *  search state from the parent block to the child in      *
 *  preparation for starting the parallel search.           *
 *                                                          *
 ************************************************************
 */
  child = block[i];
  child->used = 1;
  child->stop = 0;
  for (i = 0; i < smp_max_threads; i++)
    child->siblings[i] = 0;
  child->nprocs = 0;
  child->ply = ply;
  child->position = parent->position;
  child->pv[ply - 1] = parent->pv[ply - 1];
  child->pv[ply] = parent->pv[ply];
  child->next_status[ply] = parent->next_status[ply];
  child->save_hash_key[ply] = parent->save_hash_key[ply];
  child->save_pawn_hash_key[ply] = parent->save_pawn_hash_key[ply];
  child->rep_index = parent->rep_index;
  for (i = 0; i <= parent->rep_index + parent->ply; i++)
    child->rep_list[i] = parent->rep_list[i];
  child->last[ply] = child->move_list;
  child->hash_move[ply] = parent->hash_move[ply];
  for (i = 1; i <= ply + 1; i++) {
    child->status[i] = parent->status[i];
    child->curmv[i] = parent->curmv[i];
    child->inchk[i] = parent->inchk[i];
    child->phase[i] = parent->phase[i];
  }
  for (i = 1; i < MAXPLY; i++)
    child->killers[i] = parent->killers[i];
  child->nodes_searched = 0;
  child->fail_highs = 0;
  child->fail_high_first_move = 0;
  child->evaluations = 0;
  child->egtb_probes = 0;
  child->egtb_probes_successful = 0;
  child->extensions_done = 0;
  child->qchecks_done = 0;
  child->reductions_done = 0;
  child->moves_fpruned = 0;
  child->alpha = parent->alpha;
  child->beta = parent->beta;
  child->value = parent->value;
  child->side = parent->side;
  child->depth = parent->depth;
  parent->siblings[thread] = child;
  child->thread_id = thread;
  child->parent = parent;
  parent->nprocs++;
  strcpy_s(child->root_move_text, sizeof (child->root_move_text), parent->root_move_text); // Pierre-Marie Baty -- use safe version
  strcpy_s(child->remaining_moves_text, sizeof (child->remaining_moves_text), parent->remaining_moves_text); // Pierre-Marie Baty -- use safe version
}
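
/*
 * The first/last arithmetic above gives each thread a private, NUMA-local
 * pool of split blocks.  For example, assuming MAX_BLOCKS_PER_CPU were 64,
 * thread 2 would own block[129] through block[192]:
 *
 *     first = 2 * 64 + 1;    // 129
 *     last  = first + 64;    // 193, exclusive upper bound
 *
 * Both allocation scans start at index 1, so block[0] is never handed out
 * here.
 */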

/* modified 04/18/14 */
/*
 *******************************************************************************
 *                                                                             *
 *   CopyToParent() is used to copy data from a child thread to a parent      *
 *   thread.  This only copies the appropriate parts of the TREE structure to *
 *   avoid burning memory bandwidth by copying everything.                    *
 *                                                                             *
 *******************************************************************************
 */
void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) {
  int ply = parent->ply;

/*
 ************************************************************
 *                                                          *
 *  The only concern here is to make sure that the info is  *
 *  only copied to the parent if our score is greater than  *
 *  the parent's value, and that we were not stopped for    *
 *  any reason, which could produce a partial score that is *
 *  worthless.                                              *
 *                                                          *
 *  In any case, we add our statistical counters to the     *
 *  parent's totals no matter whether we finished or not,   *
 *  since the total nodes searched and such should consider *
 *  everything searched, not just the "useful stuff."       *
 *                                                          *
 ************************************************************
 */
  if (child->nodes_searched && !child->stop && value > parent->value &&
      !abort_search) {
    parent->pv[ply] = child->pv[ply];
    parent->value = value;
    parent->cutmove = child->curmv[ply];
  }
  parent->nodes_searched += child->nodes_searched;
  parent->fail_highs += child->fail_highs;
  parent->fail_high_first_move += child->fail_high_first_move;
  parent->evaluations += child->evaluations;
  parent->egtb_probes += child->egtb_probes;
  parent->egtb_probes_successful += child->egtb_probes_successful;
  parent->extensions_done += child->extensions_done;
  parent->qchecks_done += child->qchecks_done;
  parent->reductions_done += child->reductions_done;
  parent->moves_fpruned += child->moves_fpruned;
  child->used = 0;
}

/*
 *******************************************************************************
 *                                                                             *
 *   WaitForAllThreadsInitialized() waits until all smp_max_threads are       *
 *   initialized.  We have to initialize each thread and malloc() its split   *
 *   blocks before we start the actual parallel search.  Otherwise we will see*
 *   invalid memory accesses and crash instantly.                             *
 *                                                                             *
 *******************************************************************************
 */
void WaitForAllThreadsInitialized(void) {
  while (initialized_threads < smp_max_threads); /* Do nothing */
}
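
/*
 * A sketch of how this handshake is typically driven (hypothetical setup
 * code, not part of this file): the original process creates the helper
 * threads, each runs ThreadInit() below, and nobody searches until every
 * thread has first-touched its local split blocks.
 *
 *     for (i = 1; i < smp_max_threads; i++)
 *       pthread_create(&pt, NULL, ThreadInit, (void *) (int64_t) i);
 *     WaitForAllThreadsInitialized();
 */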

/* modified 06/07/09 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadInit() is called after a process is created.  Its main task is to  *
 *   initialize the process local memory so that it will fault in and be      *
 *   allocated on the local node rather than the node where the original      *
 *   (first) process was running.  All threads will hang here via a custom    *
 *   WaitForAllThreadsInitialized() procedure so that all the local thread    *
 *   blocks are usable before the search actually begins.                     *
 *                                                                             *
 *******************************************************************************
 */
void *STDCALL ThreadInit(void *tid) {
  int i, j;
#if defined(AFFINITY)
  int64_t k;
  cpu_set_t cpuset;
  pthread_t current_thread = pthread_self();

  k = (int64_t) tid;
  CPU_ZERO(&cpuset);
  CPU_SET(k, &cpuset);
  pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
#endif
#if !defined(UNIX)
  ThreadMalloc((uint64_t) tid);
#endif
  for (i = 0; i < MAX_BLOCKS_PER_CPU; i++) {
    memset((void *) block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1], 0,
        sizeof(TREE));
    block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->used = 0;
    block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->parent = NULL;
    LockInit(block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->lock);
    for (j = 0; j < 64; j++)
      block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->cache_n[j] = ~0ull;
  }
  Lock(lock_smp);
  initialized_threads++;
  Unlock(lock_smp);
  WaitForAllThreadsInitialized();
  ThreadWait((uint64_t) tid, (TREE *) 0);
  Lock(lock_smp);
  smp_threads--;
  Unlock(lock_smp);
  return 0;
}
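
/*
 * Why the memset() above matters: on typical NUMA systems, a page is
 * committed to the node of the CPU that first writes it ("first touch").
 * Because each helper thread zeroes its own block[] entries here, after
 * being pinned by the AFFINITY code, its split blocks end up in memory
 * local to its own core.  A minimal statement of the idea:
 *
 *     TREE *t = malloc(sizeof(TREE));  // virtual reservation only
 *     memset(t, 0, sizeof(TREE));      // pages fault in on THIS node
 */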

#if !defined (UNIX)
/* modified 01/17/09 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadMalloc() is called from the ThreadInit() function.  It malloc's the*
 *   split blocks in the local memory for the processor associated with the   *
 *   specific thread that is calling this code.                               *
 *                                                                             *
 *******************************************************************************
 */
extern void *WinMalloc(size_t, int);
void ThreadMalloc(int64_t tid) {
  int i, n = MAX_BLOCKS_PER_CPU;

  for (i = MAX_BLOCKS_PER_CPU * (int) (tid) + 1; n; i++, n--) { // Pierre-Marie Baty -- added type cast
    if (block[i] == NULL)
      block[i] =
          (TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) +
                  127, (int) tid))); // Pierre-Marie Baty -- added type cast
    block[i]->used = 0;
    block[i]->parent = NULL;
    LockInit(block[i]->lock);
  }
}
#endif
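
/*
 * The pointer arithmetic in ThreadMalloc() rounds the WinMalloc() result up
 * to the next 128-byte boundary, so every TREE starts on a cache-line
 * boundary (assuming 64- or 128-byte lines).  In more familiar form:
 *
 *     aligned = (raw + 127) & ~(size_t) 127;  // round up to multiple of 128
 */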

/* modified 01/17/09 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadStop() is called from SearchParallel() when it detects a beta      *
 *   cutoff (fail high) at a node that is being searched in parallel.  We need*
 *   to stop all threads here, and since this threading algorithm is recursive*
 *   it may be necessary to stop other threads that are helping search this   *
 *   branch further down into the tree.  This function simply sets tree->stop *
 *   to 1, which will stop that particular thread instantly and return it to  *
 *   the idle loop in ThreadWait().                                           *
 *                                                                             *
 *******************************************************************************
 */
void ThreadStop(TREE * RESTRICT tree) {
  int proc;

  Lock(tree->lock);
  tree->stop = 1;
  for (proc = 0; proc < smp_max_threads; proc++)
    if (tree->siblings[proc])
      ThreadStop(tree->siblings[proc]);
  Unlock(tree->lock);
}

/* modified 04/23/14 */
/*
 *******************************************************************************
 *                                                                             *
 *   ThreadWait() is the idle loop for the N threads that are created at the  *
 *   beginning when Crafty searches.  Threads are "parked" here waiting on a  *
 *   pointer to something they should search (a parameter block built in the  *
 *   function Thread() in this case).  When this pointer becomes non-zero,    *
 *   each thread "parked" here will immediately call SearchParallel() and     *
 *   begin the parallel search as directed.                                   *
 *                                                                             *
 *******************************************************************************
 */
int ThreadWait(int64_t tid, TREE * RESTRICT waiting) {
  int value, tstart, tend;

/*
 ************************************************************
 *                                                          *
 *  The N initial threads enter here and are kept penned    *
 *  here forever.  However, once a thread leaves here it    *
 *  may well re-enter ThreadWait() from the top while it    *
 *  waits for a parallel search to complete.  While it      *
 *  waits here it can also join in to help other busy       *
 *  threads search their subtrees as well.                  *
 *                                                          *
 ************************************************************
 */
  while (1) {
    tstart = ReadClock();
    Lock(lock_split);
    Lock(lock_smp);
    smp_idle++;
    if (!waiting && smp_split == 0)
      smp_split = 1;
    if (!thread[tid].tree)
      thread[tid].idle = 1;
    Unlock(lock_smp);
    Unlock(lock_split);
/*
 ************************************************************
 *                                                          *
 *  We can exit if our thread[tid].tree pointer is          *
 *  non-zero, or if we are waiting on others to finish a    *
 *  block that *we* have to return through.  When the busy  *
 *  count on such a block hits zero, we return immediately, *
 *  which unwinds the search as it should.                  *
 *                                                          *
 ************************************************************
 */
    while (!thread[tid].tree && (!waiting || waiting->nprocs))
      Pause();
    thread[tid].idle = 0;
    tend = ReadClock();
    Lock(lock_smp);
    idle_time += tend - tstart;
    if (!thread[tid].tree)
      thread[tid].tree = waiting;
/*
 ************************************************************
 *                                                          *
 *  We either have work to do, or threads we were waiting   *
 *  on have finished their work.                            *
 *                                                          *
 ************************************************************
 */
    smp_idle--;
    Unlock(lock_smp);
/*
 ************************************************************
 *                                                          *
 *  If we are waiting on a block and the busy count is now  *
 *  zero, we simply return to finish up the bookkeeping at  *
 *  that point.                                             *
 *                                                          *
 ************************************************************
 */
    if (thread[tid].tree == waiting || thread[tid].tree == (TREE *) -1)
      return 0;
/*
 ************************************************************
 *                                                          *
 *  Else our thread[tid].tree pointer is non-zero, meaning  *
 *  we have been assigned something to search.  Hi-ho,      *
 *  hi-ho, it's off to work we go...                        *
 *                                                          *
 ************************************************************
 */
    value =
        SearchParallel(thread[tid].tree, thread[tid].tree->alpha,
        thread[tid].tree->beta, thread[tid].tree->value,
        thread[tid].tree->side, thread[tid].tree->depth,
        thread[tid].tree->ply);
    Lock(thread[tid].tree->parent->lock);
    CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
    thread[tid].tree->parent->nprocs--;
    thread[tid].tree->parent->siblings[tid] = 0;
    Unlock(thread[tid].tree->parent->lock);
    thread[tid].tree = 0;
  }
}