-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtemplater.c
113 lines (93 loc) · 4.49 KB
/
templater.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <Python.h>
/*
Original code from:
author='Adrian Holovaty',
author_email='[email protected]',
http://templatemaker.googlecode.com/
longest_match() and longest_match_shifter()
Returns the length of the longest common substring (LCS) in the strings
a and b.
Sets a_offset to the index of a where the LCS begins.
Sets b_offset to the index of b where the LCS begins.
If there is NO common substring, it returns 0 and sets both
a_offset and b_offset to -1.
The strings do not have to be equal length.
The algorithm works by comparing one character at a time and "shifting" the
strings so that different characters are compared. For example, given
a="ABC" and b="DEF", picture these alignments:
(Shift a to the right)
-------------------------------------------------------
a | ABC ABC ABC
b | DEF DEF DEF
shift index | 0 1 2
possible LCS | 3 2 1
comparisons | AD, BE, CF AE, BF AF
(Shift b to the right)
-------------------------------------------------------
| ABC ABC ABC
| DEF DEF DEF
shift index | 0 1 2
possible LCS | 3 2 1
comparisons | AD, BE, CF BD, CE CD
The algorithm short circuits based on the best_size found so far. For example,
given a="ABC" and b="ABC", the first cycle of comparisons (AA, BB, CC) would
result in a best_size=3. Because the algorithm starts with zero shift (i.e.,
it starts with the highest possible LCS) and adds 1 to the shift index each
time through, it can safely exit without doing any more comparisons.
This algorithm is O^(m + m-1 + m-2 + ... + 1 + n + n-1 + n-2 + ... + 1), where
m and n are the length of the two strings. Due to short circuiting, the
algorithm could potentially finish after the very
first set of comparisons. The algorithm is slowest when the LCS is smallest,
and the algorithm is fastest when the LCS is biggest.
longest_match_shifter() performs "one side" of the shift -- e.g., "Shift a to
the right" in the above illustration. longest_match() simply calls
longest_match_shifter() twice, flipping the strings.
*/
int longest_match_shifter(char* a, char* b, int a_start, int a_end, int b_start, int b_end, int best_size, int* a_offset, int* b_offset) {
int i, j, k;
unsigned int current_size;
for (i = b_start, current_size = 0; i < b_end; i++, current_size = 0) { // i is b starting index.
if (best_size >= b_end - i) break; // Short-circuit. See comment above.
for (j = i, k = a_start; k < a_end && j < b_end; j++, k++) { // k is index of a, j is index of b.
if (a[k] == b[j]) {
if (++current_size > best_size) {
best_size = current_size;
*a_offset = k - current_size + 1;
*b_offset = j - current_size + 1;
}
}
else {
current_size = 0;
}
}
}
return best_size;
}
// a_offset and b_offset are relative to the *whole* string, not the substring
// (as defined by a_start and a_end).
// a_end and b_end are (the last index + 1).
int longest_match(char* a, char* b, int a_start, int a_end, int b_start, int b_end, int* a_offset, int* b_offset) {
unsigned int best_size;
*a_offset = -1;
*b_offset = -1;
best_size = longest_match_shifter(a, b, a_start, a_end, b_start, b_end, 0, a_offset, b_offset);
best_size = longest_match_shifter(b, a, b_start, b_end, a_start, a_end, best_size, b_offset, a_offset);
return best_size;
}
static PyObject * function_longest_match(PyObject *self, PyObject *args) {
char* a;
char* b;
int a_offset, b_offset, lena, lenb;
unsigned int best_size;
if (!PyArg_ParseTuple(args, "s#s#", &a, &lena, &b, &lenb))
return NULL;
best_size = longest_match(a, b, 0, lena, 0, lenb, &a_offset, &b_offset);
return Py_BuildValue("(iii)", best_size, a_offset, b_offset);
}
static PyMethodDef ModuleMethods[] = {
{"longest_match", function_longest_match, METH_VARARGS, "Given two strings, determines the longest common substring and returns a tuple of (best_size, a_offset, b_offset)."},
{NULL, NULL, 0, NULL} // sentinel
};
PyMODINIT_FUNC init_templater(void) {
(void) Py_InitModule("_templater", ModuleMethods);
}