def search_boyer_moore(text, pattern): len_text = len(text) len_pattern = len(pattern) result_list = [] t=0 is_match = True while t <= len_text - len_pattern: print(f'DEBUG: --- {text}') print(f' --- {" "*t}{pattern}') for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode if pattern[p] != text[t+p]: # mismatch! find bad and good shift is_match = False print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}') # GOOD SHIFT CALCULATION if p == len_pattern - 1: # it is end of pattern - no good shift good_shift = 0 print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}') else: # search for substrings matching to already processed suffix k=1 # note: it is reverse pattern processing, adding k makes suffix shorter while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]: k=k+1 if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it! good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern]) print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}') else: good_shift=0 # good-suffix found, but has length of 1 - don't use it, bad shift will be better print(f' DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}') # BAD SHIFT CALCULATION if text[t+p] in pattern[0:p]: # if bad character exists in the pattern bad_shift=t+p-pattern[0:p].rfind(text[t+p]) # move to this character, so it matches with the pattern and text print(f' DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}') else: bad_shift=t+p+1 # if bad character not found - move analysis after the current char print(f' DEBUG: bad character not found - bad_shift proposed = {bad_shift}') t=max((bad_shift, good_shift)) print(f' DEBUG: selected shift to {t}') break if is_match: result_list.append(t) print(f'DEBUG: Match found on index {t}') t = t+1 else: is_match = True return result_list
def generate_prefix(pattern): pat_len = len(pattern) pat_fun = [0] * pat_len pref_len = 0 # length of the previous longest prefix suffix if pat_len <= 1: # eliminate trivial conditions return pat_fun pat_fun[0] = 0 # always 0 i = 1 while i < pat_len: if pattern[i] == pattern[pref_len]: pref_len += 1 pat_fun[i] = pref_len i += 1 else: # This is tricky. The pattern may contain multiple sub-prefixes. This allows to use them if pref_len != 0: pref_len = pat_fun[pref_len-1] # we do not increment i here # the loop will execute again for the same i and pref_len # until a matching prefix would be found or we step down to pref_len == 0 else: pat_fun[i] = 0 i += 1 return pat_fun def search_kmp(pattern, text): pat_len = len(pattern) txt_len = len(text) pat_fun = generate_prefix(pattern) results_list = [] p = 0 # index pointing to pattern t = 0 # index pointing to text while t < txt_len: if pattern[p] == text[t]: # print('DEBUG: ', pattern[:p+1],'matches',text[:t+1]) t += 1 p += 1 if p == pat_len: # print ('DEBUG: ', "Found pattern at index", t-p) results_list.append(t-p) p = pat_fun[p-1] # print('DEBUG: ', 'p comes back to', p) else: if p != 0: # print('DEBUG: ', 'Mismatch!', pattern[p], '<>', text[t], ' in ', pattern[:p+1], 'and', text[:t+1]) p = pat_fun[p-1] # print('DEBUG: ', 't reamains', t, 'p comes back to', p, 'we will compare ', pattern[:p+1], 'with text', text[:t+1]) else: # print('DEBUG: ', 'Mismatch! Return to start!', pattern[p], '<>', text[t]) t += 1 # print('DEBUG: ', 't increase to', t, 'p remains', p, 'we will compare', pattern[:p+1], 'with text', text[:t+1]) return results_list def search_naive(pattern, text): result_list = [] for text_start in range(len(text) - len(pattern) + 1): found = True for i in range(len(pattern)): if pattern[i] != text[text_start+i]: found = False break if found: result_list.append(text_start) return result_list def generate_hash(text, pattern): len_text = len(text) len_pattern = len(pattern) # lists with ordinals for charactes from the text and pattern ascii_text = [ord(i) for i in text] ascii_pattern = [ord(i) for i in pattern] # calculate hash for pattern hash_pattern = sum(ascii_pattern) # calculate hash for text len_hash_text = len_text - len_pattern + 1 hash_text = [0] * len_hash_text for i in range(0, len_hash_text): if i == 0: hash_text[i] = sum(ascii_text[:len_pattern]) else: hash_text[i] = (hash_text[i-1] - ascii_text[i-1] + ascii_text[i+len_pattern-1]) return [hash_text, hash_pattern] def search_rabin_karp(pattern,text): hash_text, hash_pattern = generate_hash(text, pattern) # hash_text has length of: len_text - len_pattern + 1 len_text = len(text) len_pattern = len(pattern) result_list = [] for i in range(len(hash_text)): if hash_text[i] == hash_pattern: # it is possible that we have a match matched_chars_count = 0 for j in range(len_pattern): if pattern[j] == text[i+j]: matched_chars_count += 1 else: break # no - it is not a match, aborting! if matched_chars_count == len_pattern: # when matched_chars_count == len_pattern then the full pattern was in the text result_list.append(i) # print(f"DEBUG: Found at index {i}") return result_list def search_boyer_moore(pattern,text): len_text = len(text) len_pattern = len(pattern) result_list = [] t=0 is_match = True while t <= len_text - len_pattern: #print(f'DEBUG: --- {text}') #print(f' --- {" "*t}{pattern}') for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode if pattern[p] != text[t+p]: # mismatch! find bad and good shift is_match = False #print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}') # GOOD SHIFT CALCULATION if p == len_pattern - 1: # it is end of pattern - no good shift good_shift = 0 #print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}') else: # search for substrings matching to already processed suffix k=1 # note: it is reverse pattern processing, adding k makes suffix shorter while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]: k=k+1 if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it! good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern]) #print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}') else: good_shift=0 # good-suffix found, but has length of 1 - don't use it, bad shift will be better #print(f' DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}') # BAD SHIFT CALCULATION if text[t+p] in pattern[0:p]: # if bad character exists in the pattern bad_shift=t+p-pattern[0:p].rfind(text[t+p]) # move to this character, so it matches with the pattern and text #print(f' DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}') else: bad_shift=t+p+1 # if bad character not found - move analysis after the current char #print(f' DEBUG: bad character not found - bad_shift proposed = {bad_shift}') t=max((bad_shift, good_shift)) #print(f' DEBUG: selected shift to {t}') break if is_match: result_list.append(t) #print(f'DEBUG: Match found on index {t}') t = t+1 else: is_match = True return result_list import random import time max_len = 1000000 #allowed_chars = ['a','b'] allowed_chars = ['a','b',' '] print('Generating random text') start = time.time() text = ''.join(random.choice(allowed_chars) for i in range(max_len)) stop = time.time() print(f'Text generation: {stop - start}') pattern = 'ababababababab' print('Searching') # checking time - naive search start = time.time() results_naive = search_naive(pattern, text) stop = time.time() print(f'Naive search: found {len(results_naive)} matches in {stop - start} seconds') # checking time - KMP search start = time.time() results_kmp = search_kmp(pattern, text) stop = time.time() print(f'KMP search: found {len(results_kmp)} matches in {stop - start} seconds') # checking time - Rabin-Karp search start = time.time() results_rk = search_rabin_karp(pattern, text) stop = time.time() print(f'Rabin-Karp search: found {len(results_rk)} matches in {stop - start} seconds') # checking time - Boyer-Moore search start = time.time() results_bm = search_boyer_moore(pattern, text) stop = time.time() print(f'Boyer-Moore search: found {len(results_bm)} matches in {stop - start} seconds')