import core.stdc.stdlib: alloca, malloc, free;
import core.exception: OutOfMemoryError;
int editDistanceFast(T, int STATICLEN=200)(T[] arr1, T[] arr2) {
/*
Levenshtein.c v2003-05-10
Python extension computing Levenshtein distances, string similarities,
median strings and other goodies.
Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>.
Translated to D language, shortened, converted to a template, and improved
its memory allocation, by leonardo maffi, V.1.0, May 6 2008
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
T* array1 = arr1.ptr;
size_t len1 = arr1.length;
T* array2 = arr2.ptr;
size_t len2 = arr2.length;
// strip common prefix
while (len1 > 0 && len2 > 0 && *array1 == *array2) {
len1--;
len2--;
array1++;
array2++;
}
// strip common suffix
while (len1 > 0 && len2 > 0 && array1[len1-1] == array2[len2-1]) {
len1--;
len2--;
}
// catch trivial cases
if (len1 == 0)
return len2;
if (len2 == 0)
return len1;
// make the inner cycle (i.e. array2) the longer one
if (len1 > len2) {
size_t nx = len1;
T* sx = array1;
len1 = len2;
len2 = nx;
array1 = array2;
array2 = sx;
}
// check len1 == 1 separately
if (len1 == 1) {
static if (is(T == char)) {
return len2 - (memchr(array2, *array1, len2) !is null);
} else {
foreach (x; array2[0 .. len2])
if (x == *array1)
return len2 - 1;
return len2;
}
}
len1++;
len2++;
size_t half = len1 >> 1;
// initalize first row
size_t* row; // we only need to keep one row of costs
// if the input string is small we use static memory to avoid heap activity
// tango uses solves a similar problem in a different way
bool dofree = false;
static size_t[STATICLEN] staticrow = void;
if (len2 <= STATICLEN)
row = staticrow.ptr;
else {
if (len2 < 4_000) { // about 16kb on 32-bit CPUs or 32kb on 64-bit CPUs
// if len2 is small, then we try to allocate from the stack
row = cast(size_t*)alloca(len2 * size_t.sizeof);
if (!row) {
// we try again on the C heap
row = cast(size_t*)malloc(len2 * size_t.sizeof);
if (row == null)
throw new OutOfMemoryError(__FILE__, __LINE__);
else
// we deallocate only mememory from the C heap
dofree = true;
}
} else {
row = cast(size_t*)malloc(len2 * size_t.sizeof);
if (row == null)
throw new OutOfMemoryError(__FILE__, __LINE__);
else
// we deallocate only mememory from the C heap
dofree = true;
}
}
size_t* end = row + len2 - 1;
for (size_t i = 0; i < len2 - half; i++)
row[i] = i;
// go through the matrix and compute the costs. Yes, this is an extremely
// obfuscated version, but also extremely memory-conservative and
// relatively fast.
// We don't have to scan two corner triangles (of size len1/2)
// in the matrix because no best path can go throught them. note this
// breaks when len1 == len2 == 2 so the memchr() special case above is
// necessary
row[0] = len1 - half - 1;
for (size_t i = 1; i < len1; i++) {
size_t* p;
T t1 = array1[i - 1];
T* t2p;
size_t D, x;
// skip the upper triangle
if (i >= len1 - half) {
size_t offset = i - (len1 - half);
size_t c3;
t2p = array2 + offset;
p = row + offset;
c3 = *(p++) + (t1 != *(t2p++));
x = *p;
x++;
D = x;
if (x > c3)
x = c3;
*(p++) = x;
}
else {
p = row + 1;
t2p = array2;
D = x = i;
}
// skip the lower triangle
if (i <= half + 1)
end = row + len2 + i - half - 2;
// main loop
while (p <= end) {
size_t c3 = --D + (t1 != *(t2p++));
x++;
if (x > c3)
x = c3;
D = *p;
D++;
if (x > D)
x = D;
*(p++) = x;
}
// lower triangle sentinel
if (i <= half) {
size_t c3 = --D + (t1 != *t2p);
x++;
if (x > c3)
x = c3;
*p = x;
}
}
auto result = *end;
if (dofree)
free(row);
return result;
}
// ----------------------------
import std.stdio, std.string, std.date, std.algorithm;
//import core.memory: GC;
void main() {
auto s1 = "kitten";
auto s2 = "sitting";
foreach (i; 2 .. 13) {
immutable int n = 2 ^^ i;
auto data1 = s1.repeat(n);
auto data2 = s2.repeat(n);
auto t0 = getUTCtime();
int d = editDistanceFast(data1, data2);
//int d = levenshteinDistance(data1, data2); // phobos
//int d = levenshteinDistance(cast(ubyte[])data1, cast(ubyte[])data2); // phobos
auto t1 = getUTCtime();
//GC.collect();
writeln(n, " ", d, " ", (t1 - t0) / cast(double)ticksPerSecond, );
}
}