def search_boyer_moore(text, pattern):
len_text = len(text)
len_pattern = len(pattern)
result_list = []
t=0
is_match = True
while t <= len_text - len_pattern:
print(f'DEBUG: --- {text}')
print(f' --- {" "*t}{pattern}')
for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode
if pattern[p] != text[t+p]: # mismatch! find bad and good shift
is_match = False
print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}')
# GOOD SHIFT CALCULATION
if p == len_pattern - 1: # it is end of pattern - no good shift
good_shift = 0
print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}')
else: # search for substrings matching to already processed suffix
k=1 # note: it is reverse pattern processing, adding k makes suffix shorter
while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]:
k=k+1
if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it!
good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern])
print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}')
else:
good_shift=0 # good-suffix found, but has length of 1 - don't use it, bad shift will be better
print(f' DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}')
# BAD SHIFT CALCULATION
if text[t+p] in pattern[0:p]: # if bad character exists in the pattern
bad_shift=t+p-pattern[0:p].rfind(text[t+p]) # move to this character, so it matches with the pattern and text
print(f' DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}')
else:
bad_shift=t+p+1 # if bad character not found - move analysis after the current char
print(f' DEBUG: bad character not found - bad_shift proposed = {bad_shift}')
t=max((bad_shift, good_shift))
print(f' DEBUG: selected shift to {t}')
break
if is_match:
result_list.append(t)
print(f'DEBUG: Match found on index {t}')
t = t+1
else:
is_match = True
return result_list
def generate_prefix(pattern):
pat_len = len(pattern)
pat_fun = [0] * pat_len
pref_len = 0 # length of the previous longest prefix suffix
if pat_len <= 1: # eliminate trivial conditions
return pat_fun
pat_fun[0] = 0 # always 0
i = 1
while i < pat_len:
if pattern[i] == pattern[pref_len]:
pref_len += 1
pat_fun[i] = pref_len
i += 1
else:
# This is tricky. The pattern may contain multiple sub-prefixes. This allows to use them
if pref_len != 0:
pref_len = pat_fun[pref_len-1]
# we do not increment i here
# the loop will execute again for the same i and pref_len
# until a matching prefix would be found or we step down to pref_len == 0
else:
pat_fun[i] = 0
i += 1
return pat_fun
def search_kmp(pattern, text):
pat_len = len(pattern)
txt_len = len(text)
pat_fun = generate_prefix(pattern)
results_list = []
p = 0 # index pointing to pattern
t = 0 # index pointing to text
while t < txt_len:
if pattern[p] == text[t]:
# print('DEBUG: ', pattern[:p+1],'matches',text[:t+1])
t += 1
p += 1
if p == pat_len:
# print ('DEBUG: ', "Found pattern at index", t-p)
results_list.append(t-p)
p = pat_fun[p-1]
# print('DEBUG: ', 'p comes back to', p)
else:
if p != 0:
# print('DEBUG: ', 'Mismatch!', pattern[p], '<>', text[t], ' in ', pattern[:p+1], 'and', text[:t+1])
p = pat_fun[p-1]
# print('DEBUG: ', 't reamains', t, 'p comes back to', p, 'we will compare ', pattern[:p+1], 'with text', text[:t+1])
else:
# print('DEBUG: ', 'Mismatch! Return to start!', pattern[p], '<>', text[t])
t += 1
# print('DEBUG: ', 't increase to', t, 'p remains', p, 'we will compare', pattern[:p+1], 'with text', text[:t+1])
return results_list
def search_naive(pattern, text):
result_list = []
for text_start in range(len(text) - len(pattern) + 1):
found = True
for i in range(len(pattern)):
if pattern[i] != text[text_start+i]:
found = False
break
if found:
result_list.append(text_start)
return result_list
def generate_hash(text, pattern):
len_text = len(text)
len_pattern = len(pattern)
# lists with ordinals for charactes from the text and pattern
ascii_text = [ord(i) for i in text]
ascii_pattern = [ord(i) for i in pattern]
# calculate hash for pattern
hash_pattern = sum(ascii_pattern)
# calculate hash for text
len_hash_text = len_text - len_pattern + 1
hash_text = [0] * len_hash_text
for i in range(0, len_hash_text):
if i == 0:
hash_text[i] = sum(ascii_text[:len_pattern])
else:
hash_text[i] = (hash_text[i-1] - ascii_text[i-1] + ascii_text[i+len_pattern-1])
return [hash_text, hash_pattern]
def search_rabin_karp(pattern,text):
hash_text, hash_pattern = generate_hash(text, pattern) # hash_text has length of: len_text - len_pattern + 1
len_text = len(text)
len_pattern = len(pattern)
result_list = []
for i in range(len(hash_text)):
if hash_text[i] == hash_pattern: # it is possible that we have a match
matched_chars_count = 0
for j in range(len_pattern):
if pattern[j] == text[i+j]:
matched_chars_count += 1
else:
break # no - it is not a match, aborting!
if matched_chars_count == len_pattern: # when matched_chars_count == len_pattern then the full pattern was in the text
result_list.append(i)
# print(f"DEBUG: Found at index {i}")
return result_list
def search_boyer_moore(pattern,text):
len_text = len(text)
len_pattern = len(pattern)
result_list = []
t=0
is_match = True
while t <= len_text - len_pattern:
#print(f'DEBUG: --- {text}')
#print(f' --- {" "*t}{pattern}')
for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode
if pattern[p] != text[t+p]: # mismatch! find bad and good shift
is_match = False
#print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}')
# GOOD SHIFT CALCULATION
if p == len_pattern - 1: # it is end of pattern - no good shift
good_shift = 0
#print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}')
else: # search for substrings matching to already processed suffix
k=1 # note: it is reverse pattern processing, adding k makes suffix shorter
while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]:
k=k+1
if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it!
good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern])
#print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}')
else:
good_shift=0 # good-suffix found, but has length of 1 - don't use it, bad shift will be better
#print(f' DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}')
# BAD SHIFT CALCULATION
if text[t+p] in pattern[0:p]: # if bad character exists in the pattern
bad_shift=t+p-pattern[0:p].rfind(text[t+p]) # move to this character, so it matches with the pattern and text
#print(f' DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}')
else:
bad_shift=t+p+1 # if bad character not found - move analysis after the current char
#print(f' DEBUG: bad character not found - bad_shift proposed = {bad_shift}')
t=max((bad_shift, good_shift))
#print(f' DEBUG: selected shift to {t}')
break
if is_match:
result_list.append(t)
#print(f'DEBUG: Match found on index {t}')
t = t+1
else:
is_match = True
return result_list
import random
import time
max_len = 1000000
#allowed_chars = ['a','b']
allowed_chars = ['a','b',' ']
print('Generating random text')
start = time.time()
text = ''.join(random.choice(allowed_chars) for i in range(max_len))
stop = time.time()
print(f'Text generation: {stop - start}')
pattern = 'ababababababab'
print('Searching')
# checking time - naive search
start = time.time()
results_naive = search_naive(pattern, text)
stop = time.time()
print(f'Naive search: found {len(results_naive)} matches in {stop - start} seconds')
# checking time - KMP search
start = time.time()
results_kmp = search_kmp(pattern, text)
stop = time.time()
print(f'KMP search: found {len(results_kmp)} matches in {stop - start} seconds')
# checking time - Rabin-Karp search
start = time.time()
results_rk = search_rabin_karp(pattern, text)
stop = time.time()
print(f'Rabin-Karp search: found {len(results_rk)} matches in {stop - start} seconds')
# checking time - Boyer-Moore search
start = time.time()
results_bm = search_boyer_moore(pattern, text)
stop = time.time()
print(f'Boyer-Moore search: found {len(results_bm)} matches in {stop - start} seconds')