如何编写完整优化的SuffixTree代码
本篇文章为大家展示了如何编写完整优化的SuffixTree代码,内容简明扼要并且容易理解,绝对能使你眼前一亮,通过这篇文章的详细介绍希望你能有所收获。
创新互联是一家专注于成都做网站、网站建设与策划设计,镇坪网站建设哪家好?创新互联做网站,专注于网站建设十年,网设计领域的专业建站公司;建站业务涵盖:镇坪等地区。镇坪做网站价格咨询:18980820575
一些朋友提醒,这次一次放出实现了Ukkonen 的paper的三个优化的完整SuffixTree的代码。看了之前博客的只实现SuffixLink优化的代码,再看这个应该就很简单了。
下面是SuffixTree的头文件SuffixTree.h
#pragma once #include#include using namespace std; class SuffixNode { public: vector m_pSons; SuffixNode* m_pFarther; SuffixNode* m_pSuffixLink; int m_iPathPos; int m_iEdgeStart; int m_iEdgeEnd; }; class SuffixTree { public: int m_iE;//The virtual end of all leaves. SuffixNode* m_pRoot;//The root of the tree. string m_czTreeStr;//the string that the tree represent. }; //Means a sub string of the suffix tree (string[beging],string[end]). class TreePath { public: int m_iBegin; int m_iEnd; }; //Represent the char in a node's incoming edge. class TreePos { public: TreePos() { m_iEdgePos = 0; m_pNode = NULL; } TreePos(int edgePos,SuffixNode* pNode) { m_iEdgePos = edgePos; m_pNode = pNode; } int m_iEdgePos;//The ith char of the incoming edge. SuffixNode* m_pNode;//The node we are going to search. }; //=====================================Class Declarations============================== void SingleCharExtesion(SuffixTree* pTree,TreePos*& pPos ,TreePath extendStrPath,int* firstExtensionFlag,bool * rule3Applied,int* iLeafNum); /* Add s[0....i+1],s[1...i+1].... to the suffix tree Input: SuffixNode* pNode : When we only use trick 1,pNode is the pointer to the longest leaf,s[0........i]. phase : Equals i+1 in the paper. */ void SinglePhaseExtend(SuffixTree* pTree,TreePos* & pPos,int phase,int* iExtension,int* ruleApplied,bool* lastRule3Applied,int *iLeafNum); SuffixNode* CreateTreeNode(SuffixNode* pFarther,int iedgeStart,int iedgeEnd,int pathPos); /* FollowSuffixLink : Follows the suffix link of the source node according to Ukkonen's rules(jump from s[j-1...i] to s[j....i]). Input : The tree, and node. The node is the last internal node we visited. Output: The destination node that represents the longest suffix of node's path. Example: if node represents the path "abcde" then it returns the node that represents "bcde". */ void FollowSuffixLink(SuffixTree* pTree,TreePos*& pPos, TreePath strji); int GetNodeLabelLength(SuffixTree* pTree, SuffixNode* pNode); int GetNodeLabelEnd(SuffixTree* pTree,SuffixNode* pNode); /* Find the son node which starts with the char,ch. */ SuffixNode* Find_Son(SuffixTree* pTree,SuffixNode* pFarNode, char ch); bool IsTheLastCharInEdge(SuffixTree* pTree, SuffixNode* pNode, int edge_pos); SuffixNode* ApplyExtensionRule2(SuffixNode* pNode,int edgeLabelBeg,int edgeLabelEnd,int edgePos,int pathPos,bool newLeafFlag); /* Trace the sub string(TreePath str) from the node(SuffixNode* pNode). Input: int* edgePos : where the last char is found at that edge int* charsFound : how many chars of str have been found. bool skipFlag : Use skip trick or not. */ SuffixNode* TraceString(SuffixTree* pTree,SuffixNode* pNode,TreePath str,int* edgePos,int* charsFound,bool skipFlag); /* Trace the substring(TreePath strPath) in one single edge out of pNode. */ SuffixNode* TraceSingleEdge(SuffixTree* pTree,SuffixNode* pNode,TreePath strPath,int* charsFound,int* edgePos,bool* searchDone,bool skipFlag); SuffixNode* CreateFirstCharacter(SuffixTree* pTree);//Add the first character to the suffix tree. SuffixTree* CreateSuffixTree(string tStr); /* For Debug: See if the sub string (from root to pPos) equals pTree->string[subPath.m_iBegin,subPath.m_iEnd] */ bool TestPosSubStringEqualPath(SuffixTree* pTree,TreePos *pPos, TreePath subPath); /* For debugging, We want to see if the suffix link from linkFrom to linkTo is right. */ bool TestSuffixLinkMatch(SuffixTree *pTree, SuffixNode* linkFrom,SuffixNode *linkTo); int FindSubString(SuffixTree* pTree,string subStr); //=====================================================================================
下面是SuffixTree的实现文件SuffixTree.cpp
#pragma once #include "SuffixTree.h" #includeusing namespace std; SuffixNode* pNodeNoSuffixLink=NULL; //=====================================Class Definitions============================== /* Trace the substring(TreePath strPath) in one single edge going out of pNode. Input: int* edgeCharsFound : how many characters we find matched in the outgoing edge of pNode. */ SuffixNode* TraceSingleEdge(SuffixTree* pTree,SuffixNode* pNode,TreePath strPath,int* edgeCharsFound,int* edgePos,bool* searchDone,bool skipFlag) { //Find outgoing edge of pNode with our first character. SuffixNode* nextNode = Find_Son(pTree,pNode,pTree->m_czTreeStr[strPath.m_iBegin]); *searchDone = true; if(nextNode == NULL) {//There is no match in pNode's sons,so we can only return to pNode. *edgePos = GetNodeLabelLength(pTree,pNode); *edgeCharsFound = 0; return pNode; } int edgeLen = GetNodeLabelLength(pTree,nextNode); int strLen = strPath.m_iEnd - strPath.m_iBegin + 1; if(skipFlag == true)//Use the trick1 : skip { if(edgeLen < strLen) { *searchDone = false; *edgeCharsFound = edgeLen; *edgePos = edgeLen - 1; } else if(edgeLen == strLen) { *edgeCharsFound = edgeLen; *edgePos = edgeLen - 1; } else { *edgeCharsFound = strLen; *edgePos = strLen - 1; } return nextNode; } else//No skip,match each char one after another { *edgePos = 0; *edgeCharsFound = 0; //Find out the min length if(strLen < edgeLen) edgeLen = strLen; for(*edgeCharsFound=1,*edgePos=1;(*edgePos) m_czTreeStr[ nextNode->m_iEdgeStart + *edgePos ] != pTree->m_czTreeStr[strPath.m_iBegin + *edgePos ]) { (*edgePos)--; return nextNode; } } } //When it comes here, (*edgePos) is one more; (*edgePos)--; if(*edgeCharsFound < strLen) { *searchDone = false; } return nextNode; } /* Trace the sub string(TreePath str) from the node(SuffixNode* pNode). Input: int* edgePos :For output , where the last char is found at that edge int* charsFound : How many chars of str have been found. bool skipFlag : Use skip trick or not. */ SuffixNode* TraceString(SuffixTree* pTree,SuffixNode* pNode,TreePath str,int* edgePos,int* charsFound,bool skipFlag) { bool searchDone=false; *charsFound = 0; *edgePos=0 ; int edgeCharsFound=0; while(searchDone==false) { edgeCharsFound = 0; *edgePos=0; pNode = TraceSingleEdge(pTree,pNode,str,&edgeCharsFound,edgePos,&searchDone,skipFlag); str.m_iBegin += edgeCharsFound; *charsFound += edgeCharsFound; } //if(*charsFound == 0) // return NULL; return pNode; } /* Input: (1) pNode : the node who is going to add a new son or whose edge is going to be split. (2) edgeLabelBeg : when newleafFlag==true,it's the edge begin label of the new leaf. when when newleafFlag==false, it's the edge begin label of the new new leaf( the leaf of s[i+1], not s[i]). (3) like above : just the end (4 )int edgePos : where split is done to pNode if newLeafFlag==false (the 0th position or 1th position or...) */ SuffixNode* ApplyExtensionRule2(SuffixNode* pNode,int edgeLabelBeg,int edgeLabelEnd,int edgePos,int pathPos,bool newLeafFlag) { if(newLeafFlag==true) { //Add an new leaf SuffixNode* newLeaf = CreateTreeNode(pNode,edgeLabelBeg,edgeLabelEnd,pathPos); return newLeaf; } else { //Add an new internal node and an new leaf //First create the new internal node. SuffixNode* nInternalNode = CreateTreeNode(pNode->m_pFarther,pNode->m_iEdgeStart,pNode->m_iEdgeStart + edgePos,pNode->m_iPathPos); //Remove pNode from its farther's sons for(vector ::iterator pNodeIter=pNode->m_pFarther->m_pSons.begin(); pNodeIter!=pNode->m_pFarther->m_pSons.end();pNodeIter++) { if( pNode == *pNodeIter ) { pNode->m_pFarther->m_pSons.erase(pNodeIter); break; } } //Adjust pNode's information. pNode->m_iEdgeStart += (edgePos + 1); pNode->m_pFarther = nInternalNode; nInternalNode->m_pSons.push_back(pNode); //Create the new leaf for s[i+1] SuffixNode* nLeafNode = CreateTreeNode(nInternalNode,edgeLabelBeg,edgeLabelEnd,pathPos); return nInternalNode; } } bool IsTheLastCharInEdge(SuffixTree* pTree, SuffixNode* pNode, int edge_pos) { if( edge_pos == GetNodeLabelLength(pTree,pNode) - 1 ) return true; return false; } int GetNodeLabelEnd(SuffixTree* pTree,SuffixNode* pNode) { if(pNode->m_pSons.size() == NULL) { return pTree->m_iE; } return pNode->m_iEdgeEnd; } int GetNodeLabelLength(SuffixTree* pTree, SuffixNode* pNode) { int length = GetNodeLabelEnd(pTree,pNode) - pNode->m_iEdgeStart + 1; return length; } SuffixNode* CreateTreeNode(SuffixNode* pFarther,int iedgeStart,int iedgeEnd,int pathPos) { SuffixNode* pNode=new SuffixNode(); pNode->m_iEdgeStart = iedgeStart; pNode->m_iEdgeEnd = iedgeEnd; pNode->m_pFarther = pFarther; pNode->m_iPathPos = pathPos; pNode->m_pSuffixLink = NULL; if(pFarther!=NULL) pFarther->m_pSons.push_back(pNode); return pNode; } //Find the son node which starts with the ch SuffixNode* Find_Son(SuffixTree* pTree,SuffixNode* pFarNode, char ch) { for(vector ::iterator nodeIter=pFarNode->m_pSons.begin(); nodeIter!=pFarNode->m_pSons.end();nodeIter++) { if(pTree->m_czTreeStr[(*nodeIter) -> m_iEdgeStart] == ch ) { return *nodeIter; } } return NULL; } /* FollowSuffixLink : Follows the suffix link of the source node according to Ukkonen's rules(jump from s[j-1...i] to s[j....i]). Input : The tree, and node. The node is the last internal node we visited. Output: The destination node that represents the longest suffix of node's path. Example: if node represents the path "abcde" then it returns the node that represents "bcde". */ void FollowSuffixLink(SuffixTree* pTree,TreePos* & pPos,TreePath strji) { /*gama : the string(r in Gusfield's paper) between node and its father. If the node doesn't have suffix link , we need to go up to its farther*/ TreePath gama; if(pPos->m_pNode == pTree->m_pRoot) { pPos->m_iEdgePos = 0; return; } // No suffix link,walk up at most one step(if it is not the root). if( pPos->m_pNode->m_pSuffixLink == NULL ) { if(pPos->m_pNode->m_pFarther == pTree->m_pRoot) {//its farther is the root pPos->m_pNode = pTree->m_pRoot; pPos->m_iEdgePos = 0; return; } else { // Find the gamma (the substring between pPos's parent's and pPos's) gama.m_iBegin = pPos->m_pNode->m_iEdgeStart; gama.m_iEnd = pPos->m_pNode->m_iEdgeStart + pPos->m_iEdgePos;// the end of s[j..i] //Follow farther's suffix link pPos->m_pNode = pPos->m_pNode->m_pFarther->m_pSuffixLink; //Down-walk gamma (until we found s[i],the character we add last extension) int charsFound=0; pPos->m_pNode = TraceString(pTree,pPos->m_pNode,gama,&pPos->m_iEdgePos,&charsFound,true); //////////////////////////////////////////////// } } else { //A suffix link exists - just follow it. pPos->m_pNode = pPos->m_pNode->m_pSuffixLink; pPos->m_iEdgePos = GetNodeLabelLength(pTree,pPos->m_pNode) - 1; //The last char of pPos's suffix link represents s[i] (the character we add last extension). } return; } /* For Debug: See if the sub string (from root to pPos) equals pTree->string[subPath.m_iBegin,subPath.m_iEnd] */ bool TestPosSubStringEqualPath(SuffixTree* pTree,TreePos *pPos, TreePath subPath) { if(pTree->m_pRoot == pPos->m_pNode ) { return true; } int strRevIndex = subPath.m_iEnd; SuffixNode* tmpNode = pPos->m_pNode; int edgeRevIndex = tmpNode->m_iEdgeStart + pPos->m_iEdgePos; while(tmpNode != pTree->m_pRoot) { while( edgeRevIndex >= tmpNode->m_iEdgeStart && strRevIndex >= subPath.m_iBegin) { if( pTree->m_czTreeStr[edgeRevIndex] != pTree->m_czTreeStr[strRevIndex] ) { return false; } edgeRevIndex--; strRevIndex--; } tmpNode = tmpNode->m_pFarther; edgeRevIndex = tmpNode->m_iEdgeEnd; } if(strRevIndex != subPath.m_iBegin-1) return false; return true; } /* Input: (1) SuffixTree* pTree : The suffix tree (2) TreePos* pPos : The last internal node we visited , then we are going to jump to its suffix link in this extension. (3) TreePath extendStrPath : The suffix (s[j...i+1]) we are goint to add to the tree. */ void SingleCharExtesion(SuffixTree* pTree,TreePos*& pPos ,TreePath extendStrPath,int* ruleApplied,bool* lastRule3Applied,int *iLeafNum) { TreePath sji; sji.m_iBegin = extendStrPath.m_iBegin; sji.m_iEnd = extendStrPath.m_iEnd - 1; int pathPos = extendStrPath.m_iBegin; if(*lastRule3Applied == false) { //Ready to jump from suffix link at or above s[j-1...i] that either has a suffix link to s[j...i] or the root . FollowSuffixLink(pTree,pPos,sji); } else { *lastRule3Applied = false; } //////////////////////////////////////////For Debug////////////////////////////////////////////////// if(sji.m_iEnd >= sji.m_iBegin) { if(TestPosSubStringEqualPath(pTree,pPos, sji) == false && pPos->m_pNode != pTree->m_pRoot) { cout<<"FollowSuffixLink doesn't go to the right s[j..i]:"< m_pNode == pTree->m_pRoot && sji.m_iEnd >= sji.m_iBegin) { pPos->m_pNode = TraceString(pTree,pTree->m_pRoot,sji,&pPos->m_iEdgePos,&chars_found,false); if(pPos->m_pNode == NULL) { pPos->m_iEdgePos = 0; pPos->m_pNode =pTree->m_pRoot; //////////////////////////////////For Debug////////////////////////////////////// if(sji.m_iBegin != sji.m_iEnd) { cout<<"There is s[j-1..i](not empty) doesn't exist!"< m_pNode,pPos->m_iEdgePos)) { SuffixNode* pTmp = Find_Son(pTree,pPos->m_pNode,pTree->m_czTreeStr[extendStrPath.m_iEnd]); if(pTmp != 0) { //s[i+1] exits already. chars_found = 1; *ruleApplied = 3; //We found s[i+1] which is already in the tree. pPos->m_iEdgePos = 0; pPos->m_pNode = pTmp; if(pNodeNoSuffixLink != NULL) { pNodeNoSuffixLink->m_pSuffixLink = pPos->m_pNode->m_pFarther;//Let s[j-1..i] jump to s[j...i] ///////////////////////////For Debug////////////////////////////////////////////////////////////////// if(!TestSuffixLinkMatch(pTree,pNodeNoSuffixLink ,pNodeNoSuffixLink->m_pSuffixLink)) { cout<<"Suffix Tree Build Error"< m_czTreeStr[ pPos->m_pNode->m_iEdgeStart + pPos->m_iEdgePos + 1] == pTree->m_czTreeStr[extendStrPath.m_iEnd] )//Notice that " + 1 " means the next char of s[j...i] : yes, s[i+1] {//s[i+1] exits already. chars_found = 1; if(pPos->m_pNode->m_pSons.size() == 0)//We found s[i+1] in the leaf { if( pPos->m_iEdgePos + 1 == GetNodeLabelLength(pTree,pPos->m_pNode) - 1) //Rule1 applied, we just add s[i+1] to the leaf. { *ruleApplied = 1; } else { *ruleApplied = 3; pPos->m_iEdgePos++;//Go to the s[i+1] } } else//We found s[i+1] in the internal node { *ruleApplied = 3; pPos->m_iEdgePos++;//Go to s[i+1] if(pNodeNoSuffixLink != NULL) { pNodeNoSuffixLink->m_pSuffixLink = pPos->m_pNode; ///////////////////////////For Debug////////////////////////////////////////////////////////////////// if(! TestSuffixLinkMatch(pTree,pNodeNoSuffixLink ,pNodeNoSuffixLink->m_pSuffixLink)) { cout<<"Suffix Tree Build Error"< m_pNode,pPos->m_iEdgePos) || pPos->m_pNode==pTree->m_pRoot) { if(pPos->m_pNode->m_pSons.size() != 0) { //Internal node or root,apply rule2 that add a new leaf ApplyExtensionRule2(pPos->m_pNode, extendStrPath.m_iEnd, extendStrPath.m_iEnd, 0, pathPos, true); //Suffix Link if(pNodeNoSuffixLink != NULL) { pNodeNoSuffixLink->m_pSuffixLink = pPos->m_pNode; ///////////////////////////For Debug////////////////////////////////////////////////////////////////// if(! TestSuffixLinkMatch(pTree,pNodeNoSuffixLink ,pNodeNoSuffixLink->m_pSuffixLink)) { cout<<"Suffix Tree Build Error"< m_pNode,extendStrPath.m_iEnd,extendStrPath.m_iEnd,pPos->m_iEdgePos,pathPos,false); if(pNodeNoSuffixLink != NULL) { pNodeNoSuffixLink->m_pSuffixLink = nInternalNode; ///////////////////////////For Debug////////////////////////////////////////////////////////////////// if(! TestSuffixLinkMatch(pTree,pNodeNoSuffixLink ,pNodeNoSuffixLink->m_pSuffixLink)) { cout<<"Suffix Tree Build Error"< m_pFarther == pTree->m_pRoot) { nInternalNode->m_pSuffixLink = pTree->m_pRoot; pNodeNoSuffixLink = NULL; } else { pNodeNoSuffixLink = nInternalNode; } //Adjust the node for the next extension pPos->m_pNode = nInternalNode; (*iLeafNum)++; *ruleApplied = 2; } return ; } /* Add s[0....i+1],s[1...i+1].... to the suffix tree Input: SuffixNode* pNode: When we only use trick 1,pNode is the pointer to the longest leaf,s[0........i]. */ void SinglePhaseExtend(SuffixTree* pTree,TreePos* & pPos,int phase,int* iExtension,int* ruleApplied,bool *lastRule3Applied,int *iLeafNum) { //pTree->m_iE = phase-1; //int ruleApplied=-1; while(*iExtension <= phase ) { TreePath extendPath; extendPath.m_iBegin = *iExtension; extendPath.m_iEnd = phase; SingleCharExtesion(pTree,pPos,extendPath,ruleApplied,lastRule3Applied,iLeafNum); if(*ruleApplied == 3)//We already found s[j...i+1] which means s[j+1...i+1] and the rest are already in the tree too. { *lastRule3Applied = true; break; } (*iExtension)++; } return; } SuffixNode* CreateFirstCharacter(SuffixTree* pTree) { pTree->m_iE = 0; SuffixNode* firstLeaf = CreateTreeNode(pTree->m_pRoot,0,0,0); return firstLeaf; } SuffixTree* CreateSuffixTree(string tStr) { SuffixTree* psTree=new SuffixTree(); psTree->m_czTreeStr = tStr+"$"; psTree->m_pRoot = CreateTreeNode(NULL,0,0,0); //Add the first char into it. SuffixNode* firstLeaf = CreateFirstCharacter(psTree); TreePos* lastLeafPos = new TreePos(0,firstLeaf); int ruleApplied = 2; int iExtension = 1; int iLeafNum = 1; bool lastRule3Applied = false; for(int phase = 1 ; phase m_czTreeStr.length() ; phase++) { psTree->m_iE = phase; iExtension = iLeafNum; //lastLeafPos->m_iEdgePos = lastLeafPos->m_pNode->m_iEdgeEnd - lastLeafPos->m_pNode->m_iEdgeStart; //start from s[j..i] SinglePhaseExtend(psTree,lastLeafPos,phase,&iExtension,&ruleApplied,&lastRule3Applied,&iLeafNum); } return psTree; } int FindSubString(SuffixTree* pTree,string subStr) { SuffixNode* node = Find_Son(pTree,pTree->m_pRoot,subStr[0]); if(node == NULL) { return -1; } int startIndex = node->m_iEdgeStart; int strIndex=0; int edgeIndex; while(node != NULL) { edgeIndex = node->m_iEdgeStart; int edgeLabelEnd = GetNodeLabelEnd(pTree,node); while(strIndex < subStr.length() && edgeIndex <= edgeLabelEnd && pTree->m_czTreeStr[edgeIndex] == subStr[strIndex]) { strIndex++; edgeIndex++; } if(strIndex == subStr.length()) { //we found it return node->m_iPathPos; } else if(edgeIndex > node->m_iEdgeEnd) { node = Find_Son(pTree,node,subStr[strIndex]); } else { return -1; } } return -1; } /* For debugging, We want to see if the suffix link from linkFrom to linkTo is right. */ bool TestSuffixLinkMatch(SuffixTree *pTree, SuffixNode* linkFrom,SuffixNode *linkTo) { char ch2,ch3; while(linkFrom->m_pFarther->m_pFarther!=pTree->m_pRoot && linkFrom->m_pFarther != pTree->m_pRoot) { linkFrom = linkFrom->m_pFarther; } if(linkFrom->m_pFarther == pTree->m_pRoot) { if(linkFrom->m_iEdgeEnd == linkFrom->m_iEdgeStart) { if(linkTo != pTree->m_pRoot) { return false; } else { return true; } } else { ch2 = pTree->m_czTreeStr[linkFrom->m_iEdgeStart + 1]; } } else { if(linkFrom->m_pFarther->m_iEdgeStart == linkFrom->m_pFarther->m_iEdgeEnd) { ch2 = pTree->m_czTreeStr[linkFrom->m_iEdgeStart]; } else { ch2 = pTree->m_czTreeStr[linkFrom->m_pFarther->m_iEdgeStart + 1]; } } while(linkTo->m_pFarther != pTree->m_pRoot) { linkTo = linkTo->m_pFarther; } ch3 = pTree->m_czTreeStr[linkTo->m_iEdgeStart]; if(ch2 == ch3) return true; else return false; }
最后是调用的主函数,一个简单的判断子串是否在字符串中,如果子串subStr存在于字符创str中,输出它的起点,否则输出不存在。
#include "SuffixTree.h" #include#include using namespace std;int main() { string str="avnaoihvnjovsdvaeveavsfvwevfwa vafjoajv aoja aojfoaj afowajop afoajv afjoajo csvweavefvs fjwojvwoajv haovpjvowj ajovwjavo aojvowajv ajvoajv vnaojv vfdvdfvavaewvavaisadnvioenvoehnvPIavnaoihvnjovsdvaeveavsfvwevfwa vafjoajv aoja aojfoaj afowajop afoajv afjoajo csvweavefvs fjwojvwoajv haovpjvowj ajovwjavo aojvowajv ajvoajv vnaojv vfdvdfvavaewvavaisadnvioenvoehnvPI"; string subStr=" vafjoajv aoja aojfoaj afowajop afoajv afjoajo csvweavefvs fjwojvwoajv haov"; SuffixTree* pTree = CreateSuffixTree(str); int existFlag = FindSubString(pTree,subStr); if(existFlag>=0) cout<<"Sub string starts from "< 上述内容就是如何编写完整优化的SuffixTree代码,你们学到知识或技能了吗?如果还想学到更多技能或者丰富自己的知识储备,欢迎关注创新互联行业资讯频道。
本文题目:如何编写完整优化的SuffixTree代码
文章网址:http://scyanting.com/article/psgsgi.html