// Patch summary (commentary only; the diff text below is unchanged).
//
// include/suffix_tree.h
//   * Removes the banner comment at the top of the header.
//   * Re-indents class SuffixTree and puts several opening braces on their own line.
//   * Adds the member `active_e` (initialised to 0 in the constructor). get_active_edge()
//     now returns test_str[active_e]; set_active_edge() is no longer present.
//   * Replaces check_an() with check_active_node(), which returns bool. When the active
//     length is >= the edge size it moves the active node to the edge's endpoint,
//     subtracts the edge size from the active length, advances active_e by the edge
//     size and returns true; otherwise it returns false.
//   * Drops the `rule` parameter of seperate_edge().
//   * Turns the previously commented-out cout tracing back on in Edge's constructor and
//     in Node::add_edge / del_edge / find_edge.
//
// src/suffix_tree_demo.cpp
//   * construct() now runs the insertion loop inline (`while (remainder)`) instead of
//     repeatedly calling insert().
//   * Deletes the definitions of insert(), insert_rule1() and insert_rule3().
//   * seperate_edge() only splits the edge and returns the new internal node; the
//     active-point updates and the ls.ins_link() call it used to make are removed
//     (construct() calls ls.ins_link() itself).
//   * The demo input string changes from "BANANAS" to "mississippi".
//
// NOTE(review): the added header lines still declare insert(), insert_rule1() and
//   insert_rule3() although this patch deletes their definitions -- confirm whether the
//   declarations should go as well.
// NOTE(review): Edge and Node objects are created with `new`; no matching `delete`
//   appears in the text shown here -- verify ownership elsewhere in the project.
// NOTE(review): angle-bracket contents look stripped in this copy (`#include` without a
//   target, `template` without a parameter list, `numeric_limits::max()`), and the hunks
//   are collapsed onto a few long lines -- presumably an extraction artefact; confirm
//   against the upstream commit before trying to apply this patch.
diff --git a/include/suffix_tree.h b/include/suffix_tree.h index bc6eb8a8..993db838 100644 --- a/include/suffix_tree.h +++ b/include/suffix_tree.h @@ -1,21 +1,3 @@ -/******************************************************************************* - * ALGORITHM IMPLEMENTAIONS - * - * /\ | _ _ ._ o _|_ |_ ._ _ _ - * /--\ | (_| (_) | | |_ | | | | | _> - * _| - * - * SUFFIX TREE - * - * In computer science, a suffix tree (also called PAT tree or, in an earlier - * form, position tree) is a compressed trie containing all the suffixes of the - * given text as their keys and positions in the text as their values. Suffix - * trees allow particularly fast implementations of many important string - * operations. - * - * http://en.wikipedia.org/wiki/Suffix_tree - ******************************************************************************/ - #include //#include #include @@ -35,280 +17,308 @@ using std::ostream; //typedef tr1::unordered_map map; // TODO: upgrade it to process trace. Rule: char-->elem string-->elem_list -class SuffixTree { - public: - // active point is initialized as (root, None, 0), remainder initialized as 1 - SuffixTree(string str):test_str(str), root(test_str), active_point(&root, 0, 0), remainder(0), pos(0), ls() {} - int construct(void); - - // return -1 if no such sub exist, return the beginning postion of this substring in thr original string if it exist - int search(string sub); - - // return the length of the longest prefix of sub which can be matched in suffix tree - template - Iterator inc_search(Iterator sub) { - typedef typename Iterator::value_type T; // extract real type - - Iterator result = sub; - Node* node = &root; - Edge* edge = NULL; - int pos = 0; // the iter's pos at edge - int edge_len = -1; - bool flag = true; - - while (flag) { - if (edge == NULL) { - edge = node->find_edge(*result); - if (edge == NULL) { - flag = false; - } - else { - result++; - pos = 1; // the second element of the edge - edge_len = edge->length(); - } - } - else 
{ - if (pos >= edge_len) { - node = edge->endpoint; - edge = NULL; - edge_len = 0; - } - else { - if (*result == (*edge)[pos]) { - result++; - pos++; - } - else - flag = false; - } - } +class SuffixTree +{ +public: + // active point is initialized as (root, None, 0), remainder initialized as 1 + SuffixTree(string str):test_str(str), root(test_str), active_point(&root, 0, 0), remainder(0), pos(0), active_e(0), ls() {} + int construct(void); + + // return -1 if no such sub exist, return the beginning postion of this substring in thr original string if it exist + int search(string sub); + + // return the length of the longest prefix of sub which can be matched in suffix tree + template + Iterator inc_search(Iterator sub) + { + typedef typename Iterator::value_type T; // extract real type + + Iterator result = sub; + Node* node = &root; + Edge* edge = NULL; + int pos = 0; // the iter's pos at edge + int edge_len = -1; + bool flag = true; + + + while (flag) { + if (edge == NULL) { + edge = node->find_edge(*result); + if (edge == NULL) { + flag = false; + } + else { + result++; + pos = 1; // the second element of the edge + edge_len = edge->length(); } - - return result; - } - - int print_tree(void); - private: - string test_str; - - struct Node; - typedef struct Node Node; - - struct Edge{ - // the begin and end pos of this edge, note that INT_MAX stands for #(the changing end pos of this entire string) - unsigned int begin, end; - // Is there a better way to find test_str? 
- string& test_node_str; - - Node * endpoint; - - Edge(unsigned int b, unsigned int e, string& str): test_node_str(str) { - begin = b; - end = e; - endpoint = NULL; - //std::cout << "Edge initialized" << std::endl; } - - void change_edge(unsigned int b, unsigned int e) { - begin = b; - end = e; + else { + if (pos >= edge_len) { + node = edge->endpoint; + edge = NULL; + edge_len = 0; + } + else { + if (*result == (*edge)[pos]) { + result++; + pos++; + } + else + flag = false; + } } + } + + return result; + } + + int print_tree(void); +private: + string test_str; + + struct Node; + typedef struct Node Node; + + struct Edge{ + // the begin and end pos of this edge, note that INT_MAX stands for #(the changing end pos of this entire string) + unsigned int begin, end; + // Is there a better way to find test_str? + string& test_node_str; + + Node * endpoint; + + Edge(unsigned int b, unsigned int e, string& str): + test_node_str(str) + { + begin = b; + end = e; + endpoint = NULL; + std::cout << "Edge initialized" << std::endl; + } - int length(void) { - if (end > test_node_str.size()) - return test_node_str.size() - begin; - else - return end - begin + 1; - } + void change_edge(unsigned int b, unsigned int e) + { + begin = b; + end = e; + } - // needed by map - friend bool operator<(const Edge& me, const Edge& other) { - return me.begin < other.begin; - } + int length(void) + { - char operator[](unsigned int i) { - i += begin; - if (i > end) - throw out_of_range("Edge [] out of range."); + if (end > test_node_str.size()) + return test_node_str.size() - begin; + else + return end - begin + 1; + } + + // needed by map + friend bool operator<(const Edge& me, const Edge& other) + { + return me.begin < other.begin; + } - return test_node_str[i]; - } + char operator[](unsigned int i) + { + i += begin; + if (i > end) + throw out_of_range("Edge [] out of range."); - friend ostream& operator<<(ostream& os, Edge& edge) { - unsigned int end = edge.test_node_str.size()-1; - if (end >= 
edge.end) - end = edge.end; + return test_node_str[i]; + } - char c; - for (unsigned int i=edge.begin; i<=end; i++) { - c = edge.test_node_str[i]; - os << c; - } - if (end != edge.end) - os << '#'; + friend ostream& operator<<(ostream& os, Edge& edge) + { + unsigned int end = edge.test_node_str.size()-1; + if (end >= edge.end) + end = edge.end; - return os; + char c; + for (unsigned int i=edge.begin; i<=end; i++) { + c = edge.test_node_str[i]; + os << c; } + if (end != edge.end) + os << '#'; - bool is_none(void) { return begin == 0 && end == 0; } - }; - - typedef struct Edge Edge; - - struct Node { - string& test_node_str; - map testmap; - map edges; - // find the edge quicky by storing the leading char of this edge - map findedges; - Node* suffix_link; - - friend class LinkState; + return os; + } - Node(string& str) : test_node_str(str), suffix_link(NULL) { - edges.clear(); - findedges.clear(); - } + bool is_none(void) { return begin == 0 && end == 0; } + }; + typedef struct Edge Edge; + + struct Node{ + string& test_node_str; + map testmap; + map edges; + // find the edge quicky by storing the leading char of this edge + map findedges; + Node* suffix_link; + + friend class LinkState; + + Node(string& str) : + test_node_str(str), suffix_link(NULL) { edges.clear(); findedges.clear(); } + + void add_edge(Edge* edge) { + if (edge->endpoint == NULL) + edge->endpoint = new Node(test_node_str); + make_pair(edge, true); + edges.insert(make_pair(edge, true)); + findedges.insert(make_pair(test_node_str[edge->begin], edge)); + cout << "edge added. Now we have " << edges.size() << "edges." << endl; + } - void add_edge(Edge* edge) { - if (edge->endpoint == NULL) - edge->endpoint = new Node(test_node_str); - make_pair(edge, true); - edges.insert(make_pair(edge, true)); - findedges.insert(make_pair(test_node_str[edge->begin], edge)); - //cout << "edge added. Now we have " << edges.size() << "edges." 
<< endl; + void del_edge(Edge* edge) { + map::iterator iter = edges.find(edge); + + if (iter == edges.end()) + throw out_of_range("edge don't exit"); + else { + // note we should erase the findedges too + edges.erase(edge); + cout << "delete" << (*edge)[0] << endl; + findedges.erase((*edge)[0]); + cout << "edge deleted. Now we have " << edges.size() << "edges." << endl; } - void del_edge(Edge* edge) { - map::iterator iter = edges.find(edge); + } - if (iter == edges.end()) - throw out_of_range("edge don't exit"); - else { - // note we should erase the findedges too - edges.erase(edge); - //cout << "delete" << (*edge)[0] << endl; - findedges.erase((*edge)[0]); - //cout << "edge deleted. Now we have " << edges.size() << "edges." << endl; - } + // find edge by the first char + Edge* find_edge(char c) + { + cout << "finding edge char " << c; + map::iterator iter = findedges.find(c); + cout << " founded? "; + if (iter != findedges.end()) { + cout << "yes." << endl; + return iter->second; } - - // find edge by the first char - Edge* find_edge(char c) { - //cout << "finding edge"; - map::iterator iter = findedges.find(c); - //cout << "founded?" << endl; - if (iter != findedges.end()) - return iter->second; - else - return NULL; - } - - bool isleaf() { return edges.empty(); } - - bool operator==(Node& other) { - return (this) == (&other); + else { + cout << "no." 
<< endl; + return NULL; } + } - friend ostream& operator<<(ostream& os, Node& node) { - map::iterator iter; - map::iterator iter_f; - - for (iter=node.edges.begin(); iter!=node.edges.end(); ++iter) - os << iter->first << '\t'; - os << endl; - - for (iter_f=node.findedges.begin(); iter_f!=node.findedges.end(); ++iter_f) - os << iter_f->first << "-->" << iter_f->second << endl; + bool isleaf() { return edges.empty(); } - return os; - } - }; - //typedef struct Node Node; - - class ActivePoint { - public: - Node* active_node; - char active_edge; - int active_length; - - ActivePoint(Node* node, char edge, int length): active_node(node), - active_edge(edge), active_length(length) { std::cout << "ActivePoint initialized" << std::endl; } - }; - - Node root; - ActivePoint active_point; - - Node* get_active_node(void) { return active_point.active_node; } - void set_active_node(Node* node) { active_point.active_node = node; cout << "Active node set as " << node << endl; } - char get_active_edge(void) { return active_point.active_edge; } - void set_active_edge(char edge) { active_point.active_edge = edge; } - int get_active_length(void) { return active_point.active_length; } - void set_active_length(int len) { active_point.active_length = len; } - void inc_active_len() { active_point.active_length++; } - void dec_active_len() { active_point.active_length--; } - - // how many suffixes is to be inserted? - int remainder; - // how many characters inserted? 
- unsigned int pos; - char get_ele(int i) { return test_str[i]; } - // insert a char from pos to suffix tree - int insert(); - int insert_rule1(); - int insert_rule3(); - int print_node(Node* node, int level); - - - Node* seperate_edge(Node * node, Edge* edge, int rule); - - // check if we can change active node - void check_an(void) { - Node* node = get_active_node(); - Edge* edge = node->find_edge(get_active_edge()); - - if (edge == NULL) - return; - - int edge_size = edge->end - edge->begin + 1; - - // update - if (edge_size == get_active_length()) { - set_active_node(edge->endpoint); - set_active_edge(0); - set_active_length(0); - } + bool operator==(Node& other) + { + return (this) == (&other); } - // this class indicate when shall we insert a suffix link - // ls should be a singleton - class LinkState { - bool first; + friend ostream& operator<<(ostream& os, Node& node) + { + map::iterator iter; + map::iterator iter_f; - Node* prev, *curr; + for (iter=node.edges.begin(); iter!=node.edges.end(); ++iter) + os << iter->first << '\t'; + os << endl; + + for (iter_f=node.findedges.begin(); iter_f!=node.findedges.end(); ++iter_f) + os << iter_f->first << "-->" << iter_f->second << endl; - public: - LinkState() : first(true), prev(NULL), curr(NULL) {} + return os; + } + }; + //typedef struct Node Node; - void ins_link(Node* node) { - prev = curr; - curr = node; + class ActivePoint{ + public: + Node* active_node; + char active_edge; + int active_length; + + ActivePoint(Node* node, char edge, int length): + active_node(node), active_edge(edge), active_length(length) { std::cout << "ActivePoint initialized" << std::endl; } + }; + + Node root; + ActivePoint active_point; + + Node* get_active_node(void) { return active_point.active_node; } + void set_active_node(Node* node) { active_point.active_node = node; cout << "Active node set as " << node << endl; } + char get_active_edge(void) + { + return test_str[active_e]; + } + + int get_active_length(void) { return 
active_point.active_length; } + void set_active_length(int len) { active_point.active_length = len; } + void inc_active_len() { active_point.active_length++; } + void dec_active_len() { active_point.active_length--; } + + // how many suffixes is to be inserted? + int remainder; + // how many characters inserted? + unsigned int pos; + unsigned int active_e; // the beginnig position of suffixes need to be inserted + char get_ele(int i) { return test_str[i]; } + // insert a char from pos to suffix tree + int insert(); + int insert_rule1(); + int insert_rule3(); + int print_node(Node* node, int level); + + + Node* seperate_edge(Node * node, Edge* edge); + + // check if we can change active node + bool check_active_node(void) + { + Node* node = get_active_node(); + char a_char = get_active_edge(); + Edge* edge = node->find_edge(a_char); + + if (edge == NULL) + return false; + + unsigned int edge_size = edge->end - edge->begin + 1; + unsigned int length = get_active_length(); + + // update + if (length >= edge_size) { + set_active_node(edge->endpoint); + set_active_length(length-edge_size); + active_e += edge_size; + + return true; + } + return false; + } - if (!first) { - prev->suffix_link = curr; - cout << "Suffix link added from prev " << prev << " to curr " << curr << endl; - } + // this class indicate when shall we insert a suffix link + // ls should be a singleton + class LinkState + { + bool first; + + Node* prev, *curr; - first = false; + public: + LinkState() : first(true), prev(NULL), curr(NULL) {} + + void ins_link(Node* node) + { + prev = curr; + curr = node; + + if (first == false) { + prev->suffix_link = curr; + cout << "Suffix link added from prev " << prev << " to curr " << curr << endl; } - void clear(void) { - first = true; - prev = curr = NULL; - } - }; + first = false; + } - LinkState ls; + void clear(void) + { + first = true; + prev = curr = NULL; + } + }; + LinkState ls; }; diff --git a/src/suffix_tree_demo.cpp b/src/suffix_tree_demo.cpp index 
e3ce387c..8862e9b2 100644 --- a/src/suffix_tree_demo.cpp +++ b/src/suffix_tree_demo.cpp @@ -50,223 +50,86 @@ int SuffixTree::construct(void) { // test_str shouldn't have '#' until now test_str = test_str + "#"; + using std::numeric_limits; while (pos < test_str.size()) { ls.clear(); remainder++; - //cout << "Char: " << test_str[pos] << endl; + cout << "Char: " << test_str[pos] << endl; + + while (remainder) { + int length = get_active_length(); + if (length == 0) + active_e = pos; + + Node* node = active_point.active_node; + char a_char = get_active_edge(); + Edge* a_edge = node->find_edge(a_char); + + + if (a_edge == NULL) { + Edge* newedge = new Edge(pos, numeric_limits::max(), test_str); + node->add_edge(newedge); + ls.ins_link(node); + } + else { + if (check_active_node()) + continue; + + char expected_ele = (*a_edge)[get_active_length()]; + if (expected_ele == get_ele(pos)) { + inc_active_len(); + ls.ins_link(node); + break; + } + Node *newnode = seperate_edge(node, a_edge); + Edge* newedge = new Edge(pos, numeric_limits::max(), test_str); + newnode->add_edge(newedge); + ls.ins_link(newnode); + } + remainder--; + if (node == &root && get_active_length() > 0) { + dec_active_len(); + active_e = pos - remainder + 1; + } + else if (node->suffix_link) { + set_active_node(node->suffix_link); + } + else + set_active_node(&root); + } - bool flag = true; - while (flag) - flag = insert(); pos++; } return 0; } -int SuffixTree::insert(void) -{ - int result = 0; - - Node* node = active_point.active_node; - if (node == (&root)) { - //cout << "ActiveNode is root." << endl; - result = insert_rule1(); - } - else { - //cout << "ActiveNode isn't root." 
<< endl; - result = insert_rule3(); - } - - return result; -} - -// rule1 applies when the active node is root -int SuffixTree::insert_rule1(void) +SuffixTree::Node* SuffixTree::seperate_edge(Node * node, Edge* a_edge) { - using std::numeric_limits; - - //cout << "Rule 1" << endl; - Node* node = &root; - - Edge* a_edge = node->find_edge(get_active_edge()); - - // next active edge - char active_char = 0; - - // can we find a match at active node? - Edge* possible = NULL; - bool will_insert = false; - if (get_active_length() != 0 && a_edge != NULL) { - // shouldn't throw out_of_range here, e.g. abcabc* - char match_char = (*a_edge)[get_active_length()]; - if (match_char == get_ele(pos)) - possible = a_edge; - else - will_insert = true; // will insert while active length is not 0 and activechar don't match - //cout << "Active char is " << active_char << endl; - - // node for insertion - } - else if (get_active_length() == 0) { - //cout << "Active char is NULL." << endl; - possible = node->find_edge(get_ele(pos)); - - // new active edge here and only here! 
- if (possible) - active_char = get_ele(pos); - else - active_char = 0; - } - else { - cout << "Error!!!!!!!!!!!!!!!!!!!1" << endl; - //throw; - } - - - if (possible) { - remainder++; - - // if not 0, then it's not a new edge, should not set - if (get_active_length() == 0) - set_active_edge(active_char); - - inc_active_len(); - check_an(); - } - else { - // seperate the old edge, set new active edge - if (a_edge != NULL) { - node = seperate_edge(node, a_edge, 1); - } - else - set_active_edge(0); - - //cout << "append a new edge at endpoint" << endl; - Edge* new_edge2 = new Edge(pos, numeric_limits::max(), test_str); - //cout << node << endl; - node->add_edge(new_edge2); - } - - remainder--; - - return will_insert; -} - -SuffixTree::Node* SuffixTree::seperate_edge(Node * node, Edge* a_edge, int rule) -{ - //cout << "seperate the old edge here: " << (*a_edge) << endl; - - char active_char; - - if (remainder > 2) - active_char = (*a_edge)[1]; - else - active_char = get_ele(pos); - + cout << "seperate the old edge here: " << (*a_edge) << endl; int new_begin = a_edge->begin + get_active_length(); int new_end = a_edge->end; int old_begin = a_edge->begin; int old_end = new_begin - 1; - //cout << node->find_edge(active_char) << "|||||||||||||||||||||||||| char " << active_char << endl; - //cout << (*node); + cout << (*node); node->del_edge(a_edge); a_edge->change_edge(new_begin, new_end); Edge* old_edge1 = new Edge(old_begin, old_end, test_str); node->add_edge(old_edge1); - //cout << node->find_edge(active_char) << "||||||||||||||||||||||||||2 char " << active_char << endl; old_edge1->endpoint->add_edge(a_edge); - //cout << (*node); -// old_edge1->endpoint->suffix_link = a_edge->endpoint->suffix_link; -// a_edge->endpoint->suffix_link = NULL; -/*----------------------------------------------------------------------- - Edge* new_edge1 = new Edge(new_begin, new_end, test_str); - a_edge->endpoint->add_edge(new_edge1); 
-------------------------------------------------------------------*/ - - //cout << "change edge" << endl; - //cout << "What's wrong?" << endl; cout << "The old edge split as -- " << (*a_edge) << " and -- " << (*old_edge1) << endl; - //cout << "What's wrong?" << endl; - - if (rule == 1) { - set_active_edge(active_char); - dec_active_len(); - } - else if (rule == 3) { - Node* n = &root; // new active node - //cout << node; - if (node->suffix_link) { - n = node->suffix_link; - cout << " Moved to suffix link!--------------" << endl; - } - else - cout << " Moved to root!------------------" << endl; - set_active_node(n); - } cout << "root " << (&root) << endl; - //cout << node << endl; + cout << node << endl; Node* new_node = old_edge1->endpoint; - ls.ins_link(new_node); - //cout << node << endl; - - return new_node; -} - -// applies when the active is not root -int SuffixTree::insert_rule3() -{ - //cout << "Rule3" << endl; - Node * node = get_active_node(); - cout << "Active node " << node << endl; - Edge * edge = node->find_edge(get_active_edge()); - - // input match a suffix? 
- bool match = false; - if (get_active_length() == 0) { - if (node->find_edge(get_ele(pos))) { - match = true; - - set_active_edge(get_ele(pos)); - inc_active_len(); - check_an(); - } - } - else { - // assert edge is not NULL - char match_char = (*edge)[get_active_length()]; - if (match_char == get_ele(pos)) { - match = true; - - inc_active_len(); - check_an(); - } - } - - if (match) - return 0; - - if (edge != NULL) { - node = seperate_edge(node, edge, 3); - } - - using std::numeric_limits; - - //cout << "append a new edge at endpoint" << endl; - Edge* new_edge2 = new Edge(pos, numeric_limits::max(), test_str); cout << node << endl; - node->add_edge(new_edge2); - remainder--; - - return 1; // should insert again at a different node - + return new_node; } int SuffixTree::print_tree() @@ -313,7 +176,7 @@ using namespace std; int main() { cout << "Begining" << endl; - SuffixTree st("BANANAS"); + SuffixTree st("mississippi"); cout << "Constructing..." << endl; st.construct();