dhcpd-pools  3.0
ISC dhcpd lease usage analyser
str-two-way.h
Go to the documentation of this file.
1 /* Byte-wise substring search, using the Two-Way algorithm.
2  Copyright (C) 2008-2017 Free Software Foundation, Inc.
3  This file is part of the GNU C Library.
4  Written by Eric Blake <ebb9@byu.net>, 2008.
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 3, or (at your option)
9  any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License along
17  with this program; if not, see <https://www.gnu.org/licenses/>. */
18 
19 /* Before including this file, you need to include <config.h> and
20  <string.h>, and define:
21  RESULT_TYPE A macro that expands to the return type.
22  AVAILABLE(h, h_l, j, n_l)
23  A macro that returns nonzero if there are
24  at least N_L bytes left starting at H[J].
25  H is 'unsigned char *', H_L, J, and N_L
26  are 'size_t'; H_L is an lvalue. For
27  NUL-terminated searches, H_L can be
28  modified each iteration to avoid having
29  to compute the end of H up front.
30 
31  For case-insensitivity, you may optionally define:
32  CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
33  characters of P1 and P2 are equal.
34  CANON_ELEMENT(c) A macro that canonicalizes an element right after
35  it has been fetched from one of the two strings.
36  The argument is an 'unsigned char'; the result
37  must be an 'unsigned char' as well.
38 
39  This file undefines the macros documented above, and defines
40  LONG_NEEDLE_THRESHOLD.
41 */
42 
43 #include <limits.h>
44 #include <stdint.h>
45 
46 /* We use the Two-Way string matching algorithm (also known as
47  Chrochemore-Perrin), which guarantees linear complexity with
48  constant space. Additionally, for long needles, we also use a bad
49  character shift table similar to the Boyer-Moore algorithm to
50  achieve improved (potentially sub-linear) performance.
51 
52  See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53  https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54  https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55 */
56 
57 /* Point at which computing a bad-byte shift table is likely to be
58  worthwhile. Small needles should not compute a table, since it
59  adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60  speedup no greater than a factor of NEEDLE_LEN. The larger the
61  needle, the better the potential performance gain. On the other
62  hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63  memory required for the table is prohibitive. */
64 #if CHAR_BIT < 10
65 # define LONG_NEEDLE_THRESHOLD 32U
66 #else
67 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
68 #endif
69 
70 #ifndef MAX
71 # define MAX(a, b) ((a < b) ? (b) : (a))
72 #endif
73 
74 #ifndef CANON_ELEMENT
75 # define CANON_ELEMENT(c) c
76 #endif
77 #ifndef CMP_FUNC
78 # define CMP_FUNC memcmp
79 #endif
80 
81 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82  Return the index of the first byte in the right half, and set
83  *PERIOD to the global period of the right half.
84 
85  The global period of a string is the smallest index (possibly its
86  length) at which all remaining bytes in the string are repetitions
87  of the prefix (the last repetition may be a subset of the prefix).
88 
89  When NEEDLE is factored into two halves, a local period is the
90  length of the smallest word that shares a suffix with the left half
91  and shares a prefix with the right half. All factorizations of a
92  non-empty NEEDLE have a local period of at least 1 and no greater
93  than NEEDLE_LEN.
94 
95  A critical factorization has the property that the local period
96  equals the global period. All strings have at least one critical
97  factorization with the left half smaller than the global period.
98  And while some strings have more than one critical factorization,
99  it is provable that with an ordered alphabet, at least one of the
100  critical factorizations corresponds to a maximal suffix.
101 
102  Given an ordered alphabet, a critical factorization can be computed
103  in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104  shorter of two ordered maximal suffixes. The ordered maximal
105  suffixes are determined by lexicographic comparison while tracking
106  periodicity. */
107 static size_t
108 critical_factorization (const unsigned char *needle, size_t needle_len,
109  size_t *period)
110 {
111  /* Index of last byte of left half, or SIZE_MAX. */
112  size_t max_suffix, max_suffix_rev;
113  size_t j; /* Index into NEEDLE for current candidate suffix. */
114  size_t k; /* Offset into current period. */
115  size_t p; /* Intermediate period. */
116  unsigned char a, b; /* Current comparison bytes. */
117 
118  /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119  out 0-length needles. */
120  if (needle_len < 3)
121  {
122  *period = 1;
123  return needle_len - 1;
124  }
125 
126  /* Invariants:
127  0 <= j < NEEDLE_LEN - 1
128  -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129  min(max_suffix, max_suffix_rev) < global period of NEEDLE
130  1 <= p <= global period of NEEDLE
131  p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
132  1 <= k <= p
133  */
134 
135  /* Perform lexicographic search. */
136  max_suffix = SIZE_MAX;
137  j = 0;
138  k = p = 1;
139  while (j + k < needle_len)
140  {
141  a = CANON_ELEMENT (needle[j + k]);
142  b = CANON_ELEMENT (needle[max_suffix + k]);
143  if (a < b)
144  {
145  /* Suffix is smaller, period is entire prefix so far. */
146  j += k;
147  k = 1;
148  p = j - max_suffix;
149  }
150  else if (a == b)
151  {
152  /* Advance through repetition of the current period. */
153  if (k != p)
154  ++k;
155  else
156  {
157  j += p;
158  k = 1;
159  }
160  }
161  else /* b < a */
162  {
163  /* Suffix is larger, start over from current location. */
164  max_suffix = j++;
165  k = p = 1;
166  }
167  }
168  *period = p;
169 
170  /* Perform reverse lexicographic search. */
171  max_suffix_rev = SIZE_MAX;
172  j = 0;
173  k = p = 1;
174  while (j + k < needle_len)
175  {
176  a = CANON_ELEMENT (needle[j + k]);
177  b = CANON_ELEMENT (needle[max_suffix_rev + k]);
178  if (b < a)
179  {
180  /* Suffix is smaller, period is entire prefix so far. */
181  j += k;
182  k = 1;
183  p = j - max_suffix_rev;
184  }
185  else if (a == b)
186  {
187  /* Advance through repetition of the current period. */
188  if (k != p)
189  ++k;
190  else
191  {
192  j += p;
193  k = 1;
194  }
195  }
196  else /* a < b */
197  {
198  /* Suffix is larger, start over from current location. */
199  max_suffix_rev = j++;
200  k = p = 1;
201  }
202  }
203 
204  /* Choose the shorter suffix. Return the index of the first byte of
205  the right half, rather than the last byte of the left half.
206 
207  For some examples, 'banana' has two critical factorizations, both
208  exposed by the two lexicographic extreme suffixes of 'anana' and
209  'nana', where both suffixes have a period of 2. On the other
210  hand, with 'aab' and 'bba', both strings have a single critical
211  factorization of the last byte, with the suffix having a period
212  of 1. While the maximal lexicographic suffix of 'aab' is 'b',
213  the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214  critical factorization. Conversely, the maximal reverse
215  lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216  'aab'. The shorter suffix of the two will always be a critical
217  factorization. */
218  if (max_suffix_rev + 1 < max_suffix + 1)
219  return max_suffix + 1;
220  *period = p;
221  return max_suffix_rev + 1;
222 }
223 
224 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
225  NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
226  method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227  Performance is guaranteed to be linear, with an initialization cost
228  of 2 * NEEDLE_LEN comparisons.
229 
230  If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231  most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232  If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233  HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
234 static RETURN_TYPE
235 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
236  const unsigned char *needle, size_t needle_len)
237 {
238  size_t i; /* Index into current byte of NEEDLE. */
239  size_t j; /* Index into current window of HAYSTACK. */
240  size_t period; /* The period of the right half of needle. */
241  size_t suffix; /* The index of the right half of needle. */
242 
243  /* Factor the needle into two halves, such that the left half is
244  smaller than the global period, and the right half is
245  periodic (with a period as large as NEEDLE_LEN - suffix). */
246  suffix = critical_factorization (needle, needle_len, &period);
247 
248  /* Perform the search. Each iteration compares the right half
249  first. */
250  if (CMP_FUNC (needle, needle + period, suffix) == 0)
251  {
252  /* Entire needle is periodic; a mismatch in the left half can
253  only advance by the period, so use memory to avoid rescanning
254  known occurrences of the period in the right half. */
255  size_t memory = 0;
256  j = 0;
257  while (AVAILABLE (haystack, haystack_len, j, needle_len))
258  {
259  /* Scan for matches in right half. */
260  i = MAX (suffix, memory);
261  while (i < needle_len && (CANON_ELEMENT (needle[i])
262  == CANON_ELEMENT (haystack[i + j])))
263  ++i;
264  if (needle_len <= i)
265  {
266  /* Scan for matches in left half. */
267  i = suffix - 1;
268  while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269  == CANON_ELEMENT (haystack[i + j])))
270  --i;
271  if (i + 1 < memory + 1)
272  return (RETURN_TYPE) (haystack + j);
273  /* No match, so remember how many repetitions of period
274  on the right half were scanned. */
275  j += period;
276  memory = needle_len - period;
277  }
278  else
279  {
280  j += i - suffix + 1;
281  memory = 0;
282  }
283  }
284  }
285  else
286  {
287  /* The two halves of needle are distinct; no extra memory is
288  required, and any mismatch results in a maximal shift. */
289  period = MAX (suffix, needle_len - suffix) + 1;
290  j = 0;
291  while (AVAILABLE (haystack, haystack_len, j, needle_len))
292  {
293  /* Scan for matches in right half. */
294  i = suffix;
295  while (i < needle_len && (CANON_ELEMENT (needle[i])
296  == CANON_ELEMENT (haystack[i + j])))
297  ++i;
298  if (needle_len <= i)
299  {
300  /* Scan for matches in left half. */
301  i = suffix - 1;
302  while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303  == CANON_ELEMENT (haystack[i + j])))
304  --i;
305  if (i == SIZE_MAX)
306  return (RETURN_TYPE) (haystack + j);
307  j += period;
308  }
309  else
310  j += i - suffix + 1;
311  }
312  }
313  return NULL;
314 }
315 
316 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
317  NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
318  method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319  Performance is guaranteed to be linear, with an initialization cost
320  of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321 
322  If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323  most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324  and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325  If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326  HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327  sublinear performance is not possible. */
328 static RETURN_TYPE
329 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330  const unsigned char *needle, size_t needle_len)
331 {
332  size_t i; /* Index into current byte of NEEDLE. */
333  size_t j; /* Index into current window of HAYSTACK. */
334  size_t period; /* The period of the right half of needle. */
335  size_t suffix; /* The index of the right half of needle. */
336  size_t shift_table[1U << CHAR_BIT]; /* See below. */
337 
338  /* Factor the needle into two halves, such that the left half is
339  smaller than the global period, and the right half is
340  periodic (with a period as large as NEEDLE_LEN - suffix). */
341  suffix = critical_factorization (needle, needle_len, &period);
342 
343  /* Populate shift_table. For each possible byte value c,
344  shift_table[c] is the distance from the last occurrence of c to
345  the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346  shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
347  for (i = 0; i < 1U << CHAR_BIT; i++)
348  shift_table[i] = needle_len;
349  for (i = 0; i < needle_len; i++)
350  shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
351 
352  /* Perform the search. Each iteration compares the right half
353  first. */
354  if (CMP_FUNC (needle, needle + period, suffix) == 0)
355  {
356  /* Entire needle is periodic; a mismatch in the left half can
357  only advance by the period, so use memory to avoid rescanning
358  known occurrences of the period in the right half. */
359  size_t memory = 0;
360  size_t shift;
361  j = 0;
362  while (AVAILABLE (haystack, haystack_len, j, needle_len))
363  {
364  /* Check the last byte first; if it does not match, then
365  shift to the next possible match location. */
366  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367  if (0 < shift)
368  {
369  if (memory && shift < period)
370  {
371  /* Since needle is periodic, but the last period has
372  a byte out of place, there can be no match until
373  after the mismatch. */
374  shift = needle_len - period;
375  }
376  memory = 0;
377  j += shift;
378  continue;
379  }
380  /* Scan for matches in right half. The last byte has
381  already been matched, by virtue of the shift table. */
382  i = MAX (suffix, memory);
383  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384  == CANON_ELEMENT (haystack[i + j])))
385  ++i;
386  if (needle_len - 1 <= i)
387  {
388  /* Scan for matches in left half. */
389  i = suffix - 1;
390  while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391  == CANON_ELEMENT (haystack[i + j])))
392  --i;
393  if (i + 1 < memory + 1)
394  return (RETURN_TYPE) (haystack + j);
395  /* No match, so remember how many repetitions of period
396  on the right half were scanned. */
397  j += period;
398  memory = needle_len - period;
399  }
400  else
401  {
402  j += i - suffix + 1;
403  memory = 0;
404  }
405  }
406  }
407  else
408  {
409  /* The two halves of needle are distinct; no extra memory is
410  required, and any mismatch results in a maximal shift. */
411  size_t shift;
412  period = MAX (suffix, needle_len - suffix) + 1;
413  j = 0;
414  while (AVAILABLE (haystack, haystack_len, j, needle_len))
415  {
416  /* Check the last byte first; if it does not match, then
417  shift to the next possible match location. */
418  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419  if (0 < shift)
420  {
421  j += shift;
422  continue;
423  }
424  /* Scan for matches in right half. The last byte has
425  already been matched, by virtue of the shift table. */
426  i = suffix;
427  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428  == CANON_ELEMENT (haystack[i + j])))
429  ++i;
430  if (needle_len - 1 <= i)
431  {
432  /* Scan for matches in left half. */
433  i = suffix - 1;
434  while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435  == CANON_ELEMENT (haystack[i + j])))
436  --i;
437  if (i == SIZE_MAX)
438  return (RETURN_TYPE) (haystack + j);
439  j += period;
440  }
441  else
442  j += i - suffix + 1;
443  }
444  }
445  return NULL;
446 }
447 
448 #undef AVAILABLE
449 #undef CANON_ELEMENT
450 #undef CMP_FUNC
451 #undef MAX
452 #undef RETURN_TYPE
#define SIZE_MAX
Definition: quotearg.c:52
static size_t critical_factorization(const unsigned char *needle, size_t needle_len, size_t *period)
Definition: str-two-way.h:108
#define RETURN_TYPE
Definition: strstr.c:29
#define AVAILABLE(h, h_l, j, n_l)
Definition: strstr.c:30
static RETURN_TYPE two_way_long_needle(const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len)
Definition: str-two-way.h:329
#define CANON_ELEMENT(c)
Definition: str-two-way.h:75
#define CMP_FUNC
Definition: str-two-way.h:78
#define MAX(a, b)
Definition: str-two-way.h:71
static RETURN_TYPE two_way_short_needle(const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len)
Definition: str-two-way.h:235
#define NULL
Definition: stddef.in.h:72