@@ -99,219 +99,3 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
9999 } while (lo < hi );
100100 return - lo - 1 ;
101101}
102-
103- /*
104- * Conventional binary search loop looks like this:
105- *
106- * unsigned lo, hi;
107- * do {
108- * unsigned mi = (lo + hi) / 2;
109- * int cmp = "entry pointed at by mi" minus "target";
110- * if (!cmp)
111- * return (mi is the wanted one)
112- * if (cmp > 0)
113- * hi = mi; "mi is larger than target"
114- * else
115- * lo = mi+1; "mi is smaller than target"
116- * } while (lo < hi);
117- *
118- * The invariants are:
119- *
120- * - When entering the loop, lo points at a slot that is never
121- * above the target (it could be at the target), hi points at a
122- * slot that is guaranteed to be above the target (it can never
123- * be at the target).
124- *
125- * - We find a point 'mi' between lo and hi (mi could be the same
126- * as lo, but never can be as same as hi), and check if it hits
127- * the target. There are three cases:
128- *
129- * - if it is a hit, we are happy.
130- *
131- * - if it is strictly higher than the target, we set it to hi,
132- * and repeat the search.
133- *
134- * - if it is strictly lower than the target, we update lo to
135- * one slot after it, because we allow lo to be at the target.
136- *
137- * If the loop exits, there is no matching entry.
138- *
139- * When choosing 'mi', we do not have to take the "middle" but
140- * anywhere in between lo and hi, as long as lo <= mi < hi is
141- * satisfied. When we somehow know that the distance between the
142- * target and lo is much shorter than the target and hi, we could
143- * pick mi that is much closer to lo than the midway.
144- *
145- * Now, we can take advantage of the fact that SHA-1 is a good hash
146- * function, and as long as there are enough entries in the table, we
147- * can expect uniform distribution. An entry that begins with for
148- * example "deadbeef..." is much likely to appear much later than in
149- * the midway of the table. It can reasonably be expected to be near
150- * 87% (222/256) from the top of the table.
151- *
152- * However, we do not want to pick "mi" too precisely. If the entry at
153- * the 87% in the above example turns out to be higher than the target
154- * we are looking for, we would end up narrowing the search space down
155- * only by 13%, instead of 50% we would get if we did a simple binary
156- * search. So we would want to hedge our bets by being less aggressive.
157- *
158- * The table at "table" holds at least "nr" entries of "elem_size"
159- * bytes each. Each entry has the SHA-1 key at "key_offset". The
160- * table is sorted by the SHA-1 key of the entries. The caller wants
161- * to find the entry with "key", and knows that the entry at "lo" is
162- * not higher than the entry it is looking for, and that the entry at
163- * "hi" is higher than the entry it is looking for.
164- */
165- int sha1_entry_pos (const void * table ,
166- size_t elem_size ,
167- size_t key_offset ,
168- unsigned lo , unsigned hi , unsigned nr ,
169- const unsigned char * key )
170- {
171- const unsigned char * base = table ;
172- const unsigned char * hi_key , * lo_key ;
173- unsigned ofs_0 ;
174- static int debug_lookup = -1 ;
175-
176- if (debug_lookup < 0 )
177- debug_lookup = !!getenv ("GIT_DEBUG_LOOKUP" );
178-
179- if (!nr || lo >= hi )
180- return -1 ;
181-
182- if (nr == hi )
183- hi_key = NULL ;
184- else
185- hi_key = base + elem_size * hi + key_offset ;
186- lo_key = base + elem_size * lo + key_offset ;
187-
188- ofs_0 = 0 ;
189- do {
190- int cmp ;
191- unsigned ofs , mi , range ;
192- unsigned lov , hiv , kyv ;
193- const unsigned char * mi_key ;
194-
195- range = hi - lo ;
196- if (hi_key ) {
197- for (ofs = ofs_0 ; ofs < 20 ; ofs ++ )
198- if (lo_key [ofs ] != hi_key [ofs ])
199- break ;
200- ofs_0 = ofs ;
201- /*
202- * byte 0 thru (ofs-1) are the same between
203- * lo and hi; ofs is the first byte that is
204- * different.
205- *
206- * If ofs==20, then no bytes are different,
207- * meaning we have entries with duplicate
208- * keys. We know that we are in a solid run
209- * of this entry (because the entries are
210- * sorted, and our lo and hi are the same,
211- * there can be nothing but this single key
212- * in between). So we can stop the search.
213- * Either one of these entries is it (and
214- * we do not care which), or we do not have
215- * it.
216- *
217- * Furthermore, we know that one of our
218- * endpoints must be the edge of the run of
219- * duplicates. For example, given this
220- * sequence:
221- *
222- * idx 0 1 2 3 4 5
223- * key A C C C C D
224- *
225- * If we are searching for "B", we might
226- * hit the duplicate run at lo=1, hi=3
227- * (e.g., by first mi=3, then mi=0). But we
228- * can never have lo > 1, because B < C.
229- * That is, if our key is less than the
230- * run, we know that "lo" is the edge, but
231- * we can say nothing of "hi". Similarly,
232- * if our key is greater than the run, we
233- * know that "hi" is the edge, but we can
234- * say nothing of "lo".
235- *
236- * Therefore if we do not find it, we also
237- * know where it would go if it did exist:
238- * just on the far side of the edge that we
239- * know about.
240- */
241- if (ofs == 20 ) {
242- mi = lo ;
243- mi_key = base + elem_size * mi + key_offset ;
244- cmp = memcmp (mi_key , key , 20 );
245- if (!cmp )
246- return mi ;
247- if (cmp < 0 )
248- return -1 - hi ;
249- else
250- return -1 - lo ;
251- }
252-
253- hiv = hi_key [ofs_0 ];
254- if (ofs_0 < 19 )
255- hiv = (hiv << 8 ) | hi_key [ofs_0 + 1 ];
256- } else {
257- hiv = 256 ;
258- if (ofs_0 < 19 )
259- hiv <<= 8 ;
260- }
261- lov = lo_key [ofs_0 ];
262- kyv = key [ofs_0 ];
263- if (ofs_0 < 19 ) {
264- lov = (lov << 8 ) | lo_key [ofs_0 + 1 ];
265- kyv = (kyv << 8 ) | key [ofs_0 + 1 ];
266- }
267- assert (lov < hiv );
268-
269- if (kyv < lov )
270- return -1 - lo ;
271- if (hiv < kyv )
272- return -1 - hi ;
273-
274- /*
275- * Even if we know the target is much closer to 'hi'
276- * than 'lo', if we pick too precisely and overshoot
277- * (e.g. when we know 'mi' is closer to 'hi' than to
278- * 'lo', pick 'mi' that is higher than the target), we
279- * end up narrowing the search space by a smaller
280- * amount (i.e. the distance between 'mi' and 'hi')
281- * than what we would have (i.e. about half of 'lo'
282- * and 'hi'). Hedge our bets to pick 'mi' less
283- * aggressively, i.e. make 'mi' a bit closer to the
284- * middle than we would otherwise pick.
285- */
286- kyv = (kyv * 6 + lov + hiv ) / 8 ;
287- if (lov < hiv - 1 ) {
288- if (kyv == lov )
289- kyv ++ ;
290- else if (kyv == hiv )
291- kyv -- ;
292- }
293- mi = (range - 1 ) * (kyv - lov ) / (hiv - lov ) + lo ;
294-
295- if (debug_lookup ) {
296- printf ("lo %u hi %u rg %u mi %u " , lo , hi , range , mi );
297- printf ("ofs %u lov %x, hiv %x, kyv %x\n" ,
298- ofs_0 , lov , hiv , kyv );
299- }
300- if (!(lo <= mi && mi < hi ))
301- die ("assertion failure lo %u mi %u hi %u %s" ,
302- lo , mi , hi , sha1_to_hex (key ));
303-
304- mi_key = base + elem_size * mi + key_offset ;
305- cmp = memcmp (mi_key + ofs_0 , key + ofs_0 , 20 - ofs_0 );
306- if (!cmp )
307- return mi ;
308- if (cmp > 0 ) {
309- hi = mi ;
310- hi_key = mi_key ;
311- } else {
312- lo = mi + 1 ;
313- lo_key = mi_key + elem_size ;
314- }
315- } while (lo < hi );
316- return - lo - 1 ;
317- }
0 commit comments