codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
import core.stdc.stdlib: alloca, malloc, free; import core.exception: OutOfMemoryError; int editDistanceFast(T, int STATICLEN=200)(T[] arr1, T[] arr2) { /* Levenshtein.c v2003-05-10 Python extension computing Levenshtein distances, string similarities, median strings and other goodies. Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>. Translated to D language, shortened, converted to a template, and improved its memory allocation, by leonardo maffi, V.1.0, May 6 2008 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ T* array1 = arr1.ptr; size_t len1 = arr1.length; T* array2 = arr2.ptr; size_t len2 = arr2.length; // strip common prefix while (len1 > 0 && len2 > 0 && *array1 == *array2) { len1--; len2--; array1++; array2++; } // strip common suffix while (len1 > 0 && len2 > 0 && array1[len1-1] == array2[len2-1]) { len1--; len2--; } // catch trivial cases if (len1 == 0) return len2; if (len2 == 0) return len1; // make the inner cycle (i.e. array2) the longer one if (len1 > len2) { size_t nx = len1; T* sx = array1; len1 = len2; len2 = nx; array1 = array2; array2 = sx; } // check len1 == 1 separately if (len1 == 1) { static if (is(T == char)) { return len2 - (memchr(array2, *array1, len2) !is null); } else { foreach (x; array2[0 .. len2]) if (x == *array1) return len2 - 1; return len2; } } len1++; len2++; size_t half = len1 >> 1; // initalize first row size_t* row; // we only need to keep one row of costs // if the input string is small we use static memory to avoid heap activity // tango uses solves a similar problem in a different way bool dofree = false; static size_t[STATICLEN] staticrow = void; if (len2 <= STATICLEN) row = staticrow.ptr; else { if (len2 < 4_000) { // about 16kb on 32-bit CPUs or 32kb on 64-bit CPUs // if len2 is small, then we try to allocate from the stack row = cast(size_t*)alloca(len2 * size_t.sizeof); if (!row) { // we try again on the C heap row = cast(size_t*)malloc(len2 * size_t.sizeof); if (row == null) throw new OutOfMemoryError(__FILE__, __LINE__); else // we deallocate only mememory from the C heap dofree = true; } } else { row = cast(size_t*)malloc(len2 * size_t.sizeof); if (row == null) throw new OutOfMemoryError(__FILE__, __LINE__); else // we deallocate only mememory from the C heap dofree = true; } } size_t* end = row + len2 - 1; for (size_t i = 0; i < len2 - half; i++) row[i] = i; // go through the matrix and compute the costs. Yes, this is an extremely // obfuscated version, but also extremely memory-conservative and // relatively fast. // We don't have to scan two corner triangles (of size len1/2) // in the matrix because no best path can go throught them. note this // breaks when len1 == len2 == 2 so the memchr() special case above is // necessary row[0] = len1 - half - 1; for (size_t i = 1; i < len1; i++) { size_t* p; T t1 = array1[i - 1]; T* t2p; size_t D, x; // skip the upper triangle if (i >= len1 - half) { size_t offset = i - (len1 - half); size_t c3; t2p = array2 + offset; p = row + offset; c3 = *(p++) + (t1 != *(t2p++)); x = *p; x++; D = x; if (x > c3) x = c3; *(p++) = x; } else { p = row + 1; t2p = array2; D = x = i; } // skip the lower triangle if (i <= half + 1) end = row + len2 + i - half - 2; // main loop while (p <= end) { size_t c3 = --D + (t1 != *(t2p++)); x++; if (x > c3) x = c3; D = *p; D++; if (x > D) x = D; *(p++) = x; } // lower triangle sentinel if (i <= half) { size_t c3 = --D + (t1 != *t2p); x++; if (x > c3) x = c3; *p = x; } } auto result = *end; if (dofree) free(row); return result; } // ---------------------------- import std.stdio, std.string, std.date, std.algorithm; //import core.memory: GC; void main() { auto s1 = "kitten"; auto s2 = "sitting"; foreach (i; 2 .. 13) { immutable int n = 2 ^^ i; auto data1 = s1.repeat(n); auto data2 = s2.repeat(n); auto t0 = getUTCtime(); int d = editDistanceFast(data1, data2); //int d = levenshteinDistance(data1, data2); // phobos //int d = levenshteinDistance(cast(ubyte[])data1, cast(ubyte[])data2); // phobos auto t1 = getUTCtime(); //GC.collect(); writeln(n, " ", d, " ", (t1 - t0) / cast(double)ticksPerSecond, ); } }
Private
[
?
]
Run code
Submit