Algorithms Library Toolkit
A toolkit for algorithms, especially for algorithms on formal languages
ExperimentalCompactSuffixAutomatonConstruct.h
Go to the documentation of this file.
1
9#include <string/LinearString.h>
10
11namespace stringology {
12
13namespace indexing {
14
16
17 template <class SymbolType>
18 class CompactSuffixAutomatonConstructInt {
19 int nil;
20 int newVertexNumber;//hodnota použitá při vytvoření nového vrcholu
21 int e;
22 int T;//T == ┴
23 int source;
24 int sink;
25 int m; //velikost abecedy
26 ext::map<int,SymbolType> w; //slovo, kde na indexech -m až -1 je abeceda
27 int wLen;//délka slova w
28 SymbolType endChar;//znak, kterým končí řetězec - unikátní v celém řetězci. V článku $
29
30 // vrchol popis-hrany koncový-vrchol
31 // v v v
34 ext::map<int,int> length;
35
36 int createNode() {
37 edges.insert({newVertexNumber,ext::map<ext::pair<int,int>,int>()});
38 return newVertexNumber++;
39 }
40
41 void createEdge(int fromVertex, int first, int second, int toVertex) {
42 edges.at(fromVertex).insert({{first, second},toVertex});
43 }
44
45 //hledání w[k]-edge. Použito na několika místech v algoritmu.
46 //char c je w[k]
47 void wkEdge(int s, const SymbolType & c, int & sc, int & kc, int & pc) {
48 for ( const std::pair < const ext::pair < int, int >, int > & edge : edges.at ( s ) ) {
49 if ( w.at ( edge.first.first ) == c) {
50 sc = edge.second;
51 kc = edge.first.first;
52 pc = edge.first.second;
53 return;
54 }
55
56 }
57 throw "neniHrana";
58 }
59
60 //pomocná funkce, která reprezentuje řádek 3 ve funkci split_edge()
61 void replaceTheEdgeBy(int s, int k, int p, int sc, int kc, int pc, int r) {
62 edges.at(s).erase({kc,pc});
63
64 createEdge(s,kc,kc+p-k,r);
65 createEdge(r,kc+p-k+1,pc,sc);
66 }
67
68 //pomocná funkce ve funkci redirect_edge řádek 2
69 void replaceTheEdgeByEdge(int s, int k, int p, int kc, int pc, int r) {
70 edges.at(s).erase({kc,pc});
71
72 createEdge(s,kc,kc+p-k,r);
73 }
74
75 //pomocná funkce ve funkci separate_node řádek 9
76 void replaceThewkEdge(int s, int k, int p, int rc) {
77 int kcc;
78 int scc;
79 int pcc;
80 wkEdge(s,w.at(k),scc,kcc,pcc);
81
82 edges.at(s).erase({kcc,pcc});
83 createEdge(s,k,p,rc);
84 }
85
86 //pomocná funkce, která reprezentuje řádek 4 ve funkci check_end_point
87 bool thereIsACedgeFromS(int s, const SymbolType & c) {
88 //procházím hrany vrcholu s a hledám, jestli se první znak hrany rovná c
89 for ( const std::pair < const ext::pair < int, int >, int > & edge : edges.at ( s ) )
90 if ( w.at ( edge.first.first ) == c )
91 return true;
92
93 return false;
94 }
95
96 //tvoří kopii vrcholu. (Překopíruje hrany)
97 int duplicationOf(int sc) {
98 int rc = createNode();
99
100 for ( const std::pair < const ext::pair < int, int >, int > & edge : edges.at ( sc ) )
101 createEdge ( rc, edge.first.first, edge.first.second, edge.second );
102
103 return rc;
104 }
105
106 //konec pomocných funkcí. Dále je co nejpřesnější přepis pseudokodu
107
108 bool check_end_point(int s,int k,int p, const SymbolType & c) {
109 if(k<= p) {
110 int sc;
111 int kc;
112 int pc;//s' k' p'
113 wkEdge(s,w.at(k),sc,kc,pc);
114 return (c == w.at(kc+p-k+1));
115 }
116 return thereIsACedgeFromS(s,c);
117 }
118
119 int split_edge(int s,int k,int p) {
120 int sc;
121 int kc;
122 int pc;//s' k' p'
123 wkEdge(s,w.at(k),sc,kc,pc);
124
125 int r = createNode();
126 replaceTheEdgeBy(s,k,p,sc,kc,pc,r);
127
128 int len = length.at(s);
129 if(len == INT_MAX)
130 len = e;
131
132 length[r] = len + (p-k+1);
133 return r;
134 }
135
136 ext::pair<int,int> canonize(int s,int k,int p) { //p se může rovnat e
137 if(k>p)
138 return ext::make_pair ( s, k );
139
140 int sc;
141 int kc;
142 int pc;
143 wkEdge(s,w.at(k),sc,kc,pc);
144
145 if(pc == INT_MAX)
146 pc = e;
147
148 while(pc-kc <= p-k) {
149 k = k+pc-kc+1;
150 s = sc;
151 if(k<=p) {
152 wkEdge(s,w.at(k),sc,kc,pc);
153 if(pc == INT_MAX)
154 pc = e;
155 }
156 }
157 return ext::make_pair ( s, k );
158 }
159
160 void redirect_edge(int s, int k, int p, int r) {
161 int sc;
162 int kc;
163 int pc;
164 wkEdge(s,w.at(k),sc,kc,pc);
165
166 replaceTheEdgeByEdge(s,k,p,kc,pc,r);
167 }
168
169 int extension(int s, int k, int p) {
170 if(k > p)
171 return s;
172
173 int kc;
174 int pc;
175 int sc;
176 wkEdge(s,w.at(k),sc,kc,pc);
177 return sc;
178 }
179
180 ext::pair<int,int> separate_node(int s,int k,int p) { //p==e
181 int sc;
182 int kc;
183 ext::tie ( sc, kc ) = canonize(s,k,p);
184 if ( kc <= p)
185 return ext::make_pair ( sc, kc );
186
187 int len = length.at(s);
188 if(len == INT_MAX)
189 len = e;
190
191 int lensc = length.at(sc);
192 if(lensc == INT_MAX)
193 lensc = e;
194
195 if(lensc == len+(p-k+1))
196 return ext::make_pair ( sc, kc );
197
198 int rc = duplicationOf(sc);
199 suf[rc] = suf.at(sc); suf[sc] = rc;
200 length[rc] = len +(p-k+1);
201 do {
202 replaceThewkEdge(s,k,p,rc);
203
204 ext::tie ( s, k ) = canonize(suf.at(s),k,p-1);
205 } while ( ext::make_pair ( sc, kc ) == canonize(s,k,p));
206
207 return ext::make_pair ( rc, p+1 );
208 }
209
210 ext::pair<int,int> update(int s,int k,int p) {
211 const SymbolType & c = w.at(p);
212 int oldr = nil;
213
214 int sc = nil; // v článku není s' inicializované, ale je potřeba to na něco inicializovat, protože níže dochází k porovnávní
215 int r = -1; // not initialized in the paper. The first comparison of sc must fail and therefore variable r will neverbe read with its initial value
216 while(!check_end_point(s,k,p-1,c)) {
217 if(k<= p-1) {
218 if(sc == extension(s,k,p)) {
219 redirect_edge(s,k,p-1,r);
220 ext::tie ( s, k ) = canonize(suf.at(s),k,p-1);
221 continue;
222 } else {
223 sc = extension(s,k,p);
224 r = split_edge(s,k,p-1);
225 }
226 } else
227 r = s;
228
229 createEdge(r,p,INT_MAX,sink); //p == e vždy. p je nahrazeno INT_MAX
230
231 if(oldr != nil)
232 suf[oldr] = r;
233 oldr = r;
234
235 ext::tie ( s, k ) = canonize(suf.at(s),k,p-1);
236 }
237
238 if(oldr != nil)
239 suf[oldr] = s;
240 return separate_node(s,k,p);
241 }
242
243 public:
244 CompactSuffixAutomatonConstructInt ( const /*string::LinearString*/ext::vector < SymbolType > & subject ) : wLen ( subject.size ( ) ), endChar ( subject.back ( ) ) {
245 nil = INT_MIN;
246 newVertexNumber = -1;
247 ext::set < SymbolType > alphabet ( subject.begin ( ), subject.end ( ) );
248
249 for ( int i = 0; i < wLen; ++ i )
250 w.insert ( std::make_pair ( i + 1, subject [ i ] ) );
251
252 m = alphabet.size();
253
254 int i = -1;
255 for ( const SymbolType & symbol : alphabet )//abecedu je potřeba vyskládat na záporné indexy od -1 do -m
256 w.insert ( std::make_pair ( i --, symbol ) );
257 }
258
259 void startConstruction() {
260 T = createNode();
261 source = createNode();
262 sink = createNode();
263 for(int j = 1;j<=m;j++)
264 createEdge(T,-j,-j,source);
265 suf[source] = T;
266 length[source] = 0; length[T] = -1;
267 e = 0; length[sink] = INT_MAX;
268
269 int s = source;
270 int k = 1;
271 int i = 0;
272 do {
273 i = i+1; e = i;
274 ext::tie ( s, k ) = update ( s, k, i );
275 //print ( i );
276 } while (w.at(i) != endChar);
277 }
278
279 void print ( int i ) const {
280 std::cout << "Krok " << i << std::endl;
281 for(int pi = 0;pi<newVertexNumber;pi++) {
282 std::cout << "Vrchol " << pi << std::endl;
283 for(ext::map<ext::pair<int,int>,int>::const_iterator it = edges.at(pi).begin(); it != edges.at(pi).end(); ++it) {
284 std::cout << it->first.first << " " << it->first.second;
285 std::cout << " -- ";
286 for(int j = it->first.first;j<=it->first.second;j++) {
287 if(j>i) break;
288 std::cout << w.find(j)->second;
289 }
290 std::cout << " --> " << it->second << std::endl;
291 }
292 }
293
294 std::cout << "-------------" << std::endl;
295
296 std::cout << "Delky" << std::endl;
297 for(int pi = 0;pi<newVertexNumber;pi++)
298 std::cout << "Vrchol " << pi << " " << length.find(pi)->second << std::endl;
299
300 std::cout << "Suffix links" << std::endl;
301 for(int pi = 0;pi<newVertexNumber;pi++)
302 std::cout << "Vrchol " << pi << " " << suf.find(pi)->second << std::endl;
303 }
304
305 void changeIntMaxToE() {
306 for(auto it = edges.begin();it!=edges.end();++it)
307 for(auto it2 = it->second.begin();it2!=it->second.end();++it2)
308 if(it2->first.second == INT_MAX) {
309 int k = it2->first.first;
310 int r = it2->second;
311
312 it->second.erase(it2);
313 it->second.insert({{k,e},r});
314 it2 = it->second.begin();
315 }
316 }
317
318 const ext::map<int,ext::map<ext::pair<int,int>,int>> & getEdges ( ) const {
319 return edges;
320 }
321 };
322
323public:
325 CompactSuffixAutomatonConstructInt < DefaultSymbolType > algo (subject.getContent ( ) );
326 algo.startConstruction();
327 algo.changeIntMaxToE();
328 //algo.print ( subject.getContent().size ( ) );
329
331 res.setString(subject.getContent ( ) );
332 res.setNumberOfVertices(algo.getEdges().size()-1);
333
334 for ( const auto & edge : algo.getEdges ( ) )
335 if ( edge.first != -1 )
336 res.insertVertex ( edge.first, edge.second );
337
338 return res;
339 }
340
341 template < class SymbolType >
343 SymbolType endSymbol = common::createUnique ( alphabet::EndSymbol::instance < SymbolType > ( ), subject.getAlphabet ( ) );
344 ext::vector < SymbolType > content = subject.getContent ( );
345 content.push_back ( endSymbol );
346
347 CompactSuffixAutomatonConstructInt < SymbolType > algo ( content );
348 algo.startConstruction();
349 algo.changeIntMaxToE();
350 //algo.print ( subject.getContent().size ( ) );
351
353 res.setString ( content );
354 res.setNumberOfVertices(algo.getEdges().size()-1);
355
356 for ( const auto & edge : algo.getEdges ( ) )
357 if ( edge.first != -1 )
358 res.insertVertex ( edge.first, edge.second );
359
360 return res;
361 }
362};
363
364} /* namespace indexing */
365
366} /* namespace stringology */
Class extending the map class from the standard library. Original reason is to allow printing of the ...
Definition: map.hpp:48
R & at(const T &key, R &defaultValue)
Definition: map.hpp:163
auto begin() &
Inherited behavior of begin for non-const instance.
Definition: map.hpp:185
std::pair< iterator, bool > insert(const T &key, const R &value)
Insert variant with explicit key and value parameters.
Definition: map.hpp:118
auto end() &
Inherited behavior of end for non-const instance.
Definition: map.hpp:215
size_t erase(const K &key)
Erase by key of arbitrary type.
Definition: map.hpp:340
Class extending the pair class from the standard library. Original reason is to allow printing of the...
Definition: pair.hpp:43
Definition: set.hpp:44
Class extending the vector class from the standard library. Original reason is to allow printing of t...
Definition: vector.hpp:45
auto begin() &
Inherited behavior of begin for non-const instance.
Definition: vector.hpp:125
auto end() &
Inherited behavior of end for non-const instance.
Definition: vector.hpp:155
Definition: CompactSuffixAutomatonTerminatingSymbol.h:33
Definition: LinearStringTerminatingSymbol.h:30
const ext::vector< DefaultSymbolType > & getContent() const
Definition: LinearStringTerminatingSymbol.cpp:42
Linear string.
Definition: LinearString.h:57
const ext::set< SymbolType > & getAlphabet() const &
Definition: LinearString.h:103
const ext::vector< SymbolType > & getContent() const &
Definition: LinearString.h:238
Definition: ExperimentalCompactSuffixAutomatonConstruct.h:15
static indexes::stringology::CompactSuffixAutomatonTerminatingSymbol< SymbolType > construct(const string::LinearString< SymbolType > &subject)
Definition: ExperimentalCompactSuffixAutomatonConstruct.h:342
static indexes::stringology::CompactSuffixAutomatonTerminatingSymbol< DefaultSymbolType > construct(const string::LinearStringTerminatingSymbol &subject)
Definition: ExperimentalCompactSuffixAutomatonConstruct.h:324
Definition: BarSymbol.cpp:12
p second
Definition: ToRegExpAlgebraic.h:126
int i
Definition: AllEpsilonClosure.h:118
typename T::SymbolType SymbolType
Definition: ReachableStates.h:176
return res
Definition: MinimizeByPartitioning.h:145
T createUnique(T object, const Alphabets &... alphabets)
Definition: createUnique.hpp:46
Definition: CapacityEdge.hpp:18
constexpr tuple< Elements &... > tie(Elements &... args) noexcept
Helper of extended tuple of references construction. The tuple is constructed to reffer to values in ...
Definition: tuple.hpp:218
ostream cout
constexpr auto make_pair(T1 &&x, T2 &&y)
Definition: pair.hpp:79
Definition: ArithmeticCompression.h:18