Algorithm Implementation/Strings/Dice's coefficient
< Algorithm Implementation < StringsDice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.
C++
/*
* dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
* (C) 2007 Francis Tyers
* Modifications made by Stefan Koshiw 2010
* Now it outputs values [0..1]
* Released under the terms of the GNU GPL.
*/
float dice_coefficient(wstring string1, wstring string2)
{
set<string> string1_bigrams;
set<string> string2_bigrams;
//base case
if(string1.length() == 0 || string2.length() == 0)
{
return 0;
}
for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1
string1_bigrams.insert(string1.substr(i, 2));
}
for(unsigned int i = 0; i < (string2.length() - 1); i++) { // extract character bigrams from string2
string2_bigrams.insert(string2.substr(i, 2));
}
int intersection = 0;
// find the intersection between the two sets
for(set<string>::iterator IT = string2_bigrams.begin();
IT != string2_bigrams.end();
IT++)
{
intersection += string1_bigrams.count((*IT));
}
// calculate dice coefficient
int total = string1_bigrams.size() + string2_bigrams.size();
float dice = (float)(intersection * 2) / (float)total;
return dice;
}
Haskell
a naive, non-optimized version
import Data.List (intersect, nub)
diceCoefficient sa sb =
fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams)
where
agrams = bigrams sa
bgrams = bigrams sb
bigrams xs = nub $ zip xs (tail xs)
Java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
Set<String> nx = new HashSet<String>();
Set<String> ny = new HashSet<String>();
for (int i=0; i < s1.length()-1; i++) {
char x1 = s1.charAt(i);
char x2 = s1.charAt(i+1);
String tmp = "" + x1 + x2;
nx.add(tmp);
}
for (int j=0; j < s2.length()-1; j++) {
char y1 = s2.charAt(j);
char y2 = s2.charAt(j+1);
String tmp = "" + y1 + y2;
ny.add(tmp);
}
Set<String> intersection = new HashSet<String>(nx);
intersection.retainAll(ny);
double totcombigrams = intersection.size();
return (2*totcombigrams) / (nx.size()+ny.size());
}
/**
* Here's an optimized version of the dice coefficient calculation. It takes
* advantage of the fact that a bigram of 2 chars can be stored in 1 int, and
* applies a matching algorithm of O(n*log(n)) instead of O(n*n).
*
* <p>Note that, at the time of writing, this implementation differs from the
* other implementations on this page. Where the other algorithms incorrectly
* store the generated bigrams in a set (discarding duplicates), this
* implementation actually treats multiple occurrences of a bigram as unique.
* The correctness of this behavior is most easily seen when getting the
* similarity between "GG" and "GGGGGGGG", which should obviously not be 1.
*
* @param s The first string
* @param t The second String
* @return The dice coefficient between the two input strings. Returns 0 if one
* or both of the strings are {@code null}. Also returns 0 if one or both
* of the strings contain less than 2 characters and are not equal.
* @author Jelle Fresen
*/
public static double diceCoefficientOptimized(String s, String t)
{
// Verifying the input:
if (s == null || t == null)
return 0;
// Quick check to catch identical objects:
if (s == t)
return 1;
// avoid exception for single character searches
if (s.length() < 2 || t.length() < 2)
return 0;
// Create the bigrams for string s:
final int n = s.length()-1;
final int[] sPairs = new int[n];
for (int i = 0; i <= n; i++)
if (i == 0)
sPairs[i] = s.charAt(i) << 16;
else if (i == n)
sPairs[i-1] |= s.charAt(i);
else
sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16;
// Create the bigrams for string t:
final int m = t.length()-1;
final int[] tPairs = new int[m];
for (int i = 0; i <= m; i++)
if (i == 0)
tPairs[i] = t.charAt(i) << 16;
else if (i == m)
tPairs[i-1] |= t.charAt(i);
else
tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16;
// Sort the bigram lists:
Arrays.sort(sPairs);
Arrays.sort(tPairs);
// Count the matches:
int matches = 0, i = 0, j = 0;
while (i < n && j < m)
{
if (sPairs[i] == tPairs[j])
{
matches += 2;
i++;
j++;
}
else if (sPairs[i] < tPairs[j])
i++;
else
j++;
}
return (double)matches/(n+m);
}
Javascript
function dice_coefficient(string1, string2) {
var intersection = 0;
var length1 = string1.length - 1;
var length2 = string2.length - 1;
if(length1 < 1 || length2 < 1) return 0;
var bigrams2 = [];
for(var i = 0; i < length2; i++) {
bigrams2.push(string2.substr(i,2));
}
for(var i = 0; i < length1; i++) {
var bigram1 = string1.substr(i, 2);
for(var j = 0; j < length2; j++) {
if(bigram1 == bigrams2[j]) {
intersection++;
bigrams2[j] = null;
break;
}}}
return (2.0 * intersection) / (length1 + length2);
}
Perl
sub dice_coefficient(){
my ($str1, $str2) = @_;
return 0 if (! length $str1 || ! length $str2 );
#### bigrams using REGEX.
##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg;
##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg;
my $len1 = ( length($str1) - 1 );
for (my $i=0; $i<$len1; $i++){
push @bigrams_str1, substr($str1, $i, 2);
}
my $len2 = ( length($str2) - 1 );
for (my $i=0; $i<$len2; $i++){
push @bigrams_str2, substr($str2, $i, 2);
}
my $intersect = 0;
STR1: for my $bigram (@bigrams_str1){
for my $i (0 .. $#bigrams_str2){
next unless $bigrams_str2[$i];
if ($bigram eq $bigrams_str2[$i]){
$intersect++;
$bigrams_str2[$i] = undef;
next STR1;
}
}
}
my $combinedLength = ($#bigrams_str1+1 + $#bigrams_str2+1);
my $dice = ($intersect * 2 / $combinedLength);
return $dice;
}
Python
def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
a_bigrams = set(a)
b_bigrams = set(b)
overlap = len(a_bigrams & b_bigrams)
return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
""" more orthodox and robust implementation """
def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
if not len(a) or not len(b): return 0.0
if len(a) == 1: a=a+u'.'
if len(b) == 1: b=b+u'.'
a_bigram_list=[]
for i in range(len(a)-1):
a_bigram_list.append(a[i:i+2])
b_bigram_list=[]
for i in range(len(b)-1):
b_bigram_list.append(b[i:i+2])
a_bigrams = set(a_bigram_list)
b_bigrams = set(b_bigram_list)
overlap = len(a_bigrams & b_bigrams)
dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
return dice_coeff
""" duplicate bigrams in a word should be counted distinctly
(per discussion), otherwise 'AA' and 'AAAA' would have a
dice coefficient of 1...
"""
def dice_coefficient(a,b):
if not len(a) or not len(b): return 0.0
""" quick case for true duplicates """
if a == b: return 1.0
""" if a != b, and a or b are single chars, then they can't possibly match """
if len(a) == 1 or len(b) == 1: return 0.0
""" use python list comprehension, preferred over list.append() """
a_bigram_list = [a[i:i+2] for i in range(len(a)-1)]
b_bigram_list = [b[i:i+2] for i in range(len(b)-1)]
a_bigram_list.sort()
b_bigram_list.sort()
# assignments to save function calls
lena = len(a_bigram_list)
lenb = len(b_bigram_list)
# initialize match counters
matches = i = j = 0
while (i < lena and j < lenb):
if a_bigram_list[i] == b_bigram_list[j]:
matches += 2
i += 1
j += 1
elif a_bigram_list[i] < b_bigram_list[j]:
i += 1
else:
j += 1
score = float(matches)/float(lena + lenb)
return score
Objective C
int compare (const void * a, const void * b)
{
return ( *(int*)a - *(int*)b );
}
+(float)diceCoefficient:(NSString*)s s2:(NSString*)t
{
// Verifying the input:
if (s == nil || t == nil) return 0;
// Quick check to catch identical objects:
if (s == t) return 1;
// avoid exception for single character searches
if (s.length < 2 || t.length < 2) return 0;
// Create the bigrams for string s:
int n = s.length-1;
int sPairs[n];
for (int i = 0; i <= n; i++)
if (i == 0) sPairs[i] = [s characterAtIndex:i] << 16;
else if (i == n) sPairs[i-1] |= [s characterAtIndex:i];
else sPairs[i] = (sPairs[i-1] |= [s characterAtIndex:i]) << 16;
// Create the bigrams for string t:
int m = t.length-1;
int tPairs[m];
for (int i = 0; i <= m; i++)
if (i == 0) tPairs[i] = [t characterAtIndex:i] << 16;
else if (i == m) tPairs[i-1] |= [t characterAtIndex:i];
else tPairs[i] = (tPairs[i-1] |= [t characterAtIndex:i]) << 16;
// Sort the bigram lists:
qsort(sPairs, n, sizeof(int), compare);
qsort(tPairs, m, sizeof(int), compare);
// Count the matches:
int matches = 0, i = 0, j = 0;
while (i < n && j < m) {
if (sPairs[i] == tPairs[j]) {
matches += 2;
i++;
j++;
} else
if (sPairs[i] < tPairs[j]) i++; else j++;
}
return (float)matches/(n+m);
}
Ruby
require 'set'
# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
a_bigrams = a.each_char.each_cons(2).to_set
b_bigrams = b.each_char.each_cons(2).to_set
overlap = (a_bigrams & b_bigrams).size
total = a_bigrams.size + b_bigrams.size
dice = overlap * 2.0 / total
dice
end
C#
public static double DiceCoefficient(string stOne, string stTwo)
{
HashSet<string> nx = new HashSet<string>();
HashSet<string> ny = new HashSet<string>();
for(int i = 0; i < stOne.Length - 1; i++)
{
char x1 = stOne[i];
char x2 = stOne[i + 1];
string temp = x1.ToString() + x2.ToString();
nx.Add(temp);
}
for(int j = 0; j < stTwo.Length - 1; j++)
{
char y1 = stTwo[j];
char y2 = stTwo[j + 1];
string temp = y1.ToString() + y2.ToString();
ny.Add(temp);
}
HashSet<string> intersection = new HashSet<string>(nx);
intersection.IntersectWith(ny);
double dbOne = intersection.Count;
return (2 * dbOne) / (nx.Count + ny.Count);
}
J
bigrams =: (_1}.])(<@,"0)(1}.])
dice_coefficient =: (2&*@~.@%@#)@(bigrams,bigrams)
C
#include <string.h>
double dice_match(const char *string1, const char *string2) {
//check fast cases
if (((string1 != NULL) && (string1[0] == '\0')) ||
((string2 != NULL) && (string2[0] == '\0'))) {
return 0;
}
if (string1 == string2) {
return 1;
}
size_t strlen1 = strlen(string1);
size_t strlen2 = strlen(string2);
if (strlen1 < 2 || strlen2 < 2) {
return 0;
}
size_t length1 = strlen1 - 1;
size_t length2 = strlen2 - 1;
double matches = 0;
int i = 0, j = 0;
//get bigrams and compare
while (i < length1 && j < length2) {
char a[3] = {string1[i], string1[i + 1], '\0'};
char b[3] = {string2[j], string2[j + 1], '\0'};
int cmp = strcmpi(a, b);
if (cmp == 0) {
matches += 2;
}
i++;
j++;
}
return matches / (length1 + length2);
}
This article is issued from Wikibooks. The text is licensed under Creative Commons - Attribution - Sharealike. Additional terms may apply for the media files.