Algorytmy, które musisz znać – Wyszukiwanie wzorca w tekście – Algorytm Boyer-Moore

Lekcja – Wyszukiwanie wzorca w tekście – Algorytm Boyer-Moore

def search_boyer_moore(text, pattern):
    
    len_text = len(text)
    len_pattern = len(pattern)
    result_list = []

    t=0
    is_match = True

    while t <= len_text - len_pattern: 
        print(f'DEBUG: --- {text}') 
        print(f' --- {" "*t}{pattern}') 
        for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode 
            if pattern[p] != text[t+p]: # mismatch! find bad and good shift 
                is_match = False 
                print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}') 
                # GOOD SHIFT CALCULATION 
                if p == len_pattern - 1: # it is end of pattern - no good shift 
                    good_shift = 0 
                    print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}') 
                else: # search for substrings matching to already processed suffix 
                    k=1 # note: it is reverse pattern processing, adding k makes suffix shorter 
                    while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]: 
                        k=k+1 
                    if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it! 
                        good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern]) 
                        print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}')
                    else:
                        good_shift=0                                       # good-suffix found, but has length of 1 - don't use it, bad shift will be better
                        print(f'  DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}')

                # BAD SHIFT CALCULATION
                if text[t+p] in pattern[0:p]:                           # if bad character exists in the pattern
                    bad_shift=t+p-pattern[0:p].rfind(text[t+p])           # move to this character, so it matches with the pattern and text
                    print(f'  DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}')
                else:
                    bad_shift=t+p+1                                       # if bad character not found - move analysis after the current char
                    print(f'  DEBUG: bad character not found - bad_shift proposed = {bad_shift}')

                t=max((bad_shift, good_shift))
                print(f'  DEBUG: selected shift to {t}')
                break
        
        if is_match: 
            result_list.append(t)
            print(f'DEBUG: Match found on index {t}')
            t = t+1
        else: 
            is_match = True

    return result_list

Rozwiązanie

def generate_prefix(pattern):
    pat_len = len(pattern)
    pat_fun = [0] * pat_len
    pref_len = 0    # length of the previous longest prefix suffix
 
    if pat_len <= 1:   # eliminate trivial conditions
        return pat_fun

    pat_fun[0] = 0     # always 0
    i = 1
 
    while i < pat_len:  
        if pattern[i] == pattern[pref_len]:
            pref_len += 1
            pat_fun[i] = pref_len
            i += 1
        else:
            # This is tricky. The pattern may contain multiple sub-prefixes. This allows to use them
            if pref_len != 0:
                pref_len = pat_fun[pref_len-1]
                # we do not increment i here
                # the loop will execute again for the same i and pref_len
                # until a matching prefix would be found or we step down to pref_len == 0
            else:
                pat_fun[i] = 0
                i += 1
    return pat_fun
    

def search_kmp(pattern, text):
    pat_len = len(pattern)
    txt_len = len(text)
    pat_fun = generate_prefix(pattern)

    results_list = []
 
    p = 0 # index pointing to pattern
    t = 0 # index pointing to text

    while t < txt_len:
        if pattern[p] == text[t]:
            # print('DEBUG: ', pattern[:p+1],'matches',text[:t+1])
            t += 1
            p += 1
            if p == pat_len:
                # print ('DEBUG: ', "Found pattern at index", t-p)
                results_list.append(t-p)
                p = pat_fun[p-1] 
                # print('DEBUG: ', 'p comes back to', p)
        else:
            if p != 0:
                # print('DEBUG: ', 'Mismatch!', pattern[p], '<>', text[t], ' in ', pattern[:p+1], 'and', text[:t+1])
                p = pat_fun[p-1]
                # print('DEBUG: ', 't reamains', t, 'p comes back to', p, 'we will compare ', pattern[:p+1], 'with text', text[:t+1])
            else:
                # print('DEBUG: ', 'Mismatch! Return to start!', pattern[p], '<>', text[t])
                t += 1
                # print('DEBUG: ', 't increase to', t, 'p remains', p, 'we will compare', pattern[:p+1], 'with text', text[:t+1])
    
    return results_list



def search_naive(pattern, text):

    result_list = []

    for text_start in range(len(text) - len(pattern) + 1):
        found = True
        for i in range(len(pattern)):
            if pattern[i] != text[text_start+i]:
                found = False
                break
        if found:
            result_list.append(text_start)
    
    return result_list


def generate_hash(text, pattern):
      
    len_text = len(text)           
    len_pattern = len(pattern)     
    # lists with ordinals for charactes from the text and pattern
    ascii_text = [ord(i) for i in text]   
    ascii_pattern = [ord(i) for i in pattern] 
                
    # calculate hash for pattern
    hash_pattern = sum(ascii_pattern)

    # calculate hash for text
    len_hash_text = len_text - len_pattern + 1    
    hash_text = [0] * len_hash_text
    
    for i in range(0, len_hash_text): 
        if i == 0:  
            hash_text[i] = sum(ascii_text[:len_pattern]) 
        else:
            hash_text[i] = (hash_text[i-1] - ascii_text[i-1] + ascii_text[i+len_pattern-1]) 

    return [hash_text, hash_pattern]        



def search_rabin_karp(pattern,text):

    hash_text, hash_pattern = generate_hash(text, pattern) # hash_text has length of:   len_text - len_pattern + 1

    len_text = len(text)              
    len_pattern = len(pattern)        

    result_list = []

    for i in range(len(hash_text)): 
        
        if hash_text[i] == hash_pattern:               # it is possible that we have a match            
            matched_chars_count = 0 
            for j in range(len_pattern): 
                if pattern[j] == text[i+j]: 
                    matched_chars_count += 1  
                else:
                    break                               # no - it is not a match, aborting!

            if matched_chars_count == len_pattern:      # when matched_chars_count == len_pattern then the full pattern was in the text
                result_list.append(i)
#                print(f"DEBUG: Found at index {i}")

    return result_list



def search_boyer_moore(pattern,text):
    
    len_text = len(text)
    len_pattern = len(pattern)
    result_list = []

    t=0
    is_match = True

    while t <= len_text - len_pattern: 
        #print(f'DEBUG: --- {text}') 
        #print(f' --- {" "*t}{pattern}') 
        for p in range(len_pattern-1, -1, -1): # pattern analyzed in reverse mode 
            if pattern[p] != text[t+p]: # mismatch! find bad and good shift 
                is_match = False 
                #print(f'DEBUG: MISMATCH index {t+p}/{p}: text - {text[t+p]} pattern - {pattern[p]}') 

                # GOOD SHIFT CALCULATION 
                if p == len_pattern - 1: # it is end of pattern - no good shift 
                    good_shift = 0 
                    #print(f' DEBUG: mismatch on last character - good_shift proposed = {good_shift}') 
                else: # search for substrings matching to already processed suffix 
                    k=1 # note: it is reverse pattern processing, adding k makes suffix shorter 
                    while text[t+p+k:t+len_pattern] not in pattern[0:len_pattern-1]: 
                        k=k+1 
                    if len(text[t+p+k:t+len_pattern]) != 1: # good-suffix found and is longer than 1 - use it! 
                        good_shift=t+p+k-pattern[0:len_pattern-1].rfind(text[t+p+k:t+len_pattern]) 
                        #print(f' DEBUG: good suffix has length > 1 - good_shift proposed = {good_shift}')
                    else:
                        good_shift=0                                       # good-suffix found, but has length of 1 - don't use it, bad shift will be better
                        #print(f'  DEBUG: good suffix has length of 1 - good_shift proposed = {good_shift}')

                # BAD SHIFT CALCULATION
                if text[t+p] in pattern[0:p]:                           # if bad character exists in the pattern
                    bad_shift=t+p-pattern[0:p].rfind(text[t+p])           # move to this character, so it matches with the pattern and text
                    #print(f'  DEBUG: bad character {text[t+p]} found - bad_shift proposed = {bad_shift}')
                else:
                    bad_shift=t+p+1                                       # if bad character not found - move analysis after the current char
                    #print(f'  DEBUG: bad character not found - bad_shift proposed = {bad_shift}')

                t=max((bad_shift, good_shift))
                #print(f'  DEBUG: selected shift to {t}')
                break
        
        if is_match: 
            result_list.append(t)
            #print(f'DEBUG: Match found on index {t}')
            t = t+1
        else: 
            is_match = True

    return result_list




import random 
import time

max_len = 1000000
#allowed_chars = ['a','b']
allowed_chars = ['a','b',' ']

print('Generating random text')
start = time.time()
text = ''.join(random.choice(allowed_chars) for i in range(max_len))
stop = time.time()
print(f'Text generation:    {stop - start}')

pattern = 'ababababababab'

print('Searching')
# checking time - naive search

start = time.time()
results_naive = search_naive(pattern, text)
stop = time.time()
print(f'Naive search:       found {len(results_naive)} matches in {stop - start} seconds')


# checking time - KMP search

start = time.time()
results_kmp = search_kmp(pattern, text)
stop = time.time()
print(f'KMP search:         found {len(results_kmp)} matches in {stop - start} seconds')


# checking time - Rabin-Karp search

start = time.time()
results_rk = search_rabin_karp(pattern, text)
stop = time.time()
print(f'Rabin-Karp search:  found {len(results_rk)} matches in {stop - start} seconds')


# checking time - Boyer-Moore search

start = time.time()
results_bm = search_boyer_moore(pattern, text)
stop = time.time()
print(f'Boyer-Moore search: found {len(results_bm)} matches in {stop - start} seconds')

Kursy Online

Lekcja – Wyszukiwanie wzorca w tekście – Algorytm Boyer-Moore

Rozwiązanie