master-thesis/util/cliffsDelta.py

50 lines
1.3 KiB
Python
Executable File

from __future__ import division
def cliffsDelta(lst1, lst2, **dull):
    """Compute Cliff's delta for two samples and label its magnitude.

    Delta is ``(#(x > y) - #(x < y)) / (len(lst1) * len(lst2))`` counted over
    every pair ``x`` from ``lst1`` and ``y`` from ``lst2``; it is positive
    when values of ``lst1`` tend to be larger than values of ``lst2``.

    :param lst1: first sample; a non-empty sized collection of mutually
        comparable values.
    :param lst2: second sample, same requirements as ``lst1``.
    :param dull: optional threshold overrides passed as keyword arguments
        ``small``, ``medium`` and ``large``. Any key that is not supplied
        keeps its default value.
    :returns: tuple ``(delta, size)`` where ``size`` is the label produced
        by ``lookup_size`` for ``delta``.
    :raises ValueError: if either sample is empty (delta is undefined).
    """
    # effect sizes from (Hess and Kromrey, 2004)
    thresholds = {'small': 0.147, 'medium': 0.33, 'large': 0.474}
    # Merge instead of replace so that a partial override such as
    # ``small=0.1`` does not lose the remaining thresholds.
    thresholds.update(dull)

    m, n = len(lst1), len(lst2)
    if m == 0 or n == 0:
        raise ValueError("cliffsDelta() requires two non-empty samples")

    lst2 = sorted(lst2)
    # j is a cursor into sorted lst2 that only ever moves forward, so both
    # samples are swept once after sorting.
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        # Skip everything in lst2 strictly below x: j such values exist.
        while j < n and lst2[j] < x:
            j += 1
        more += j * repeats
        # Skip ties with x; what remains (n - j) is strictly above x.
        while j < n and lst2[j] == x:
            j += 1
        less += (n - j) * repeats
    d = (more - less) / (m * n)
    size = lookup_size(d, thresholds)
    return d, size
def lookup_size(delta: float, dull: dict) -> str:
    """
    Map the absolute value of ``delta`` to an effect-size label.

    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    :returns: ``'negligible'``, ``'small'``, ``'medium'`` or ``'large'``;
        ``None`` when none of the comparisons holds (e.g. ``delta`` is NaN).
    """
    magnitude = abs(delta)
    if magnitude < dull['small']:
        return 'negligible'
    # Half-open bands [lower, upper); thresholds are looked up lazily, in
    # the same order as a plain chain of comparisons would read them.
    bands = (
        ('small', 'medium', 'small'),
        ('medium', 'large', 'medium'),
    )
    for lower_key, upper_key, label in bands:
        if dull[lower_key] <= magnitude < dull[upper_key]:
            return label
    return 'large' if magnitude >= dull['large'] else None
def runs(lst):
    """Iterator, chunks repeated values.

    Yields one ``(count, value)`` pair per run of consecutive equal items,
    in input order. ``value`` is the last item of the run. An empty input
    yields nothing.

    :param lst: any iterable; only adjacent items are compared, so the
        input must already be sorted/grouped for equal items to be merged.
    """
    items = iter(lst)
    try:
        previous = next(items)
    except StopIteration:
        # Empty input: there are no runs to report.
        return
    count = 1
    for current in items:
        if previous != current:
            # The run just ended; emit it and start counting the new one.
            yield count, previous
            count = 0
        count += 1
        previous = current
    yield count, previous