如何用python计算levenshteindistance_字符串相似度算法 levenshtein distance 编辑距离算法…

参考:http://www.merriampark.com/ld.htm#WHATIS

http://en.wikipedia.org/wiki/Levenshtein_distance

* Java

* C++

* Visual Basic

* PythonJava

public class Distance {

//****************************

// Get minimum of three values

//****************************

private int Minimum (int a, int b, int c) {

int mi;

mi = a;

if (b < mi) {

mi = b;

}

if (c < mi) {

mi = c;

}

return mi;

}

//*****************************

// Compute Levenshtein distance

//*****************************

public int LD (String s, String t) {

int d[][]; // matrix

int n; // length of s

int m; // length of t

int i; // iterates through s

int j; // iterates through t

char s_i; // ith character of s

char t_j; // jth character of t

int cost; // cost

// Step 1

n = s.length ();

m = t.length ();

if (n == 0) {

return m;

}

if (m == 0) {

return n;

}

d = new int[n+1][m+1];

// Step 2

for (i = 0; i <= n; i++) {

d[i][0] = i;

}

for (j = 0; j <= m; j++) {

d[0][j] = j;

}

// Step 3

for (i = 1; i <= n; i++) {

s_i = s.charAt (i - 1);

// Step 4

for (j = 1; j <= m; j++) {

t_j = t.charAt (j - 1);

// Step 5

if (s_i == t_j) {

cost = 0;

}

else {

cost = 1;

}

// Step 6

d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);

}

}

// Step 7

return d[n][m];

}

}

C++

In C++, the size of an array must be a constant, and this code fragment causes an error at compile time:

int sz = 5;

int arr[sz];

This limitation makes the following C++ code slightly more complicated than it would be if the matrix could simply be declared as a two-dimensional array, with a size determined at run-time.

In C++ it's more idiomatic to use the System Template Library's vector class, as Anders Sewerin Johansen has done in an alternative C++ implementation.

Here is the definition of the class (distance.h):

class Distance

{

public:

int LD (char const *s, char const *t);

private:

int Minimum (int a, int b, int c);

int *GetCellPointer (int *pOrigin, int col, int row, int nCols);

int GetAt (int *pOrigin, int col, int row, int nCols);

void PutAt (int *pOrigin, int col, int row, int nCols, int x);

};

Here is the implementation of the class (distance.cpp):

#include "distance.h"

#include

#include

//****************************

// Get minimum of three values

//****************************

int Distance::Minimum (int a, int b, int c)

{

int mi;

mi = a;

if (b < mi) {

mi = b;

}

if (c < mi) {

mi = c;

}

return mi;

}

//**************************************************

// Get a pointer to the specified cell of the matrix

//**************************************************

int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols)

{

return pOrigin + col + (row * (nCols + 1));

}

//*****************************************************

// Get the contents of the specified cell in the matrix

//*****************************************************

int Distance::GetAt (int *pOrigin, int col, int row, int nCols)

{

int *pCell;

pCell = GetCellPointer (pOrigin, col, row, nCols);

return *pCell;

}

//*******************************************************

// Fill the specified cell in the matrix with the value x

//*******************************************************

void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x)

{

int *pCell;

pCell = GetCellPointer (pOrigin, col, row, nCols);

*pCell = x;

}

//*****************************

// Compute Levenshtein distance

//*****************************

int Distance::LD (char const *s, char const *t)

{

int *d; // pointer to matrix

int n; // length of s

int m; // length of t

int i; // iterates through s

int j; // iterates through t

char s_i; // ith character of s

char t_j; // jth character of t

int cost; // cost

int result; // result

int cell; // contents of target cell

int above; // contents of cell immediately above

int left; // contents of cell immediately to left

int diag; // contents of cell immediately above and to left

int sz; // number of cells in matrix

// Step 1

n = strlen (s);

m = strlen (t);

if (n == 0) {

return m;

}

if (m == 0) {

return n;

}

sz = (n+1) * (m+1) * sizeof (int);

d = (int *) malloc (sz);

// Step 2

for (i = 0; i <= n; i++) {

PutAt (d, i, 0, n, i);

}

for (j = 0; j <= m; j++) {

PutAt (d, 0, j, n, j);

}

// Step 3

for (i = 1; i <= n; i++) {

s_i = s[i-1];

// Step 4

for (j = 1; j <= m; j++) {

t_j = t[j-1];

// Step 5

if (s_i == t_j) {

cost = 0;

}

else {

cost = 1;

}

// Step 6

above = GetAt (d,i-1,j, n);

left = GetAt (d,i, j-1, n);

diag = GetAt (d, i-1,j-1, n);

cell = Minimum (above + 1, left + 1, diag + cost);

PutAt (d, i, j, n, cell);

}

}

// Step 7

result = GetAt (d, n, m, n);

free (d);

return result;

}

Visual Basic

'*******************************

'*** Get minimum of three values

'*******************************

Private Function Minimum(ByVal a As Integer, _

ByVal b As Integer, _

ByVal c As Integer) As Integer

Dim mi As Integer

mi = a

If b < mi Then

mi = b

End If

If c < mi Then

mi = c

End If

Minimum = mi

End Function

'********************************

'*** Compute Levenshtein Distance

'********************************

Public Function LD(ByVal s As String, ByVal t As String) As Integer

Dim d() As Integer ' matrix

Dim m As Integer ' length of t

Dim n As Integer ' length of s

Dim i As Integer ' iterates through s

Dim j As Integer ' iterates through t

Dim s_i As String ' ith character of s

Dim t_j As String ' jth character of t

Dim cost As Integer ' cost

' Step 1

n = Len(s)

m = Len(t)

If n = 0 Then

LD = m

Exit Function

End If

If m = 0 Then

LD = n

Exit Function

End If

ReDim d(0 To n, 0 To m) As Integer

' Step 2

For i = 0 To n

d(i, 0) = i

Next i

For j = 0 To m

d(0, j) = j

Next j

' Step 3

For i = 1 To n

s_i = Mid$(s, i, 1)

' Step 4

For j = 1 To m

t_j = Mid$(t, j, 1)

' Step 5

If s_i = t_j Then

cost = 0

Else

cost = 1

End If

' Step 6

d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost)

Next j

Next i

' Step 7

LD = d(n, m)

Erase d

End Function

#!/user/bin/env python

# -*- coding: utf-8 -*-

class arithmetic():

def __init__(self):

pass

''' 【编辑距离算法】 【levenshtein distance】 【字符串相似度算法】 '''

def levenshtein(self,first,second):

if len(first) > len(second):

first,second = second,first

if len(first) == 0:

return len(second)

if len(second) == 0:

return len(first)

first_length = len(first) + 1

second_length = len(second) + 1

distance_matrix = [range(second_length) for x in range(first_length)]

#print distance_matrix

for i in range(1,first_length):

for j in range(1,second_length):

deletion = distance_matrix[i-1][j] + 1

insertion = distance_matrix[i][j-1] + 1

substitution = distance_matrix[i-1][j-1]

if first[i-1] != second[j-1]:

substitution += 1

distance_matrix[i][j] = min(insertion,deletion,substitution)

print distance_matrix

return distance_matrix[first_length-1][second_length-1]

if __name__ == "__main__":

arith = arithmetic()

print arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa'

THE END
< <上一篇
下一篇>>