Levenshtein distance improvements

This commit is contained in:
Sad Ellie 2022-05-09 21:17:23 +03:00
parent 63332d9055
commit 033e783e4c
4 changed files with 151 additions and 25 deletions

View File

@ -452,13 +452,7 @@ class MainViewModel @Inject constructor(
} else { } else {
// We are searching for a specific unit, we don't care about popularity // We are searching for a specific unit, we don't care about popularity
// We need search accuracy // We need search accuracy
basicFilteredUnits basicFilteredUnits.sortByLev(query)
.sortedBy {
it.renderedName
.substring(0, minOf(query.length, it.renderedName.length))
.lev(query)
}
.sortedByDescending { it.renderedName.contains(query) }
} }
// Group by unit group // Group by unit group
.groupBy { it.group } .groupBy { it.group }

View File

@ -8,6 +8,7 @@ import com.sadellie.unitto.data.KEY_COMMA
import com.sadellie.unitto.data.KEY_DOT import com.sadellie.unitto.data.KEY_DOT
import com.sadellie.unitto.data.KEY_E import com.sadellie.unitto.data.KEY_E
import com.sadellie.unitto.data.preferences.Separator import com.sadellie.unitto.data.preferences.Separator
import com.sadellie.unitto.data.units.AbstractUnit
import java.math.BigDecimal import java.math.BigDecimal
import java.math.RoundingMode import java.math.RoundingMode
import java.text.NumberFormat import java.text.NumberFormat
@ -75,12 +76,12 @@ object Formatter {
* @param[prefScale] Is the preferred scale, the one which will be compared against * @param[prefScale] Is the preferred scale, the one which will be compared against
*/ */
fun BigDecimal.setMinimumRequiredScale(prefScale: Int): BigDecimal { fun BigDecimal.setMinimumRequiredScale(prefScale: Int): BigDecimal {
/* Here we are getting the amount of zeros in fractional part before non zero value /**
* Here we are getting the amount of zeros in fractional part before non zero value
* For example, for 0.00000123456 we need the length of 00000 * For example, for 0.00000123456 we need the length of 00000
* Next we add one to get the position of the first non zero value * Next we add one to get the position of the first non zero value
*
* Also, this block is only for VERY small numbers * Also, this block is only for VERY small numbers
* */ */
return this.setScale( return this.setScale(
max( max(
prefScale, prefScale,
@ -102,31 +103,33 @@ fun openLink(mContext: Context, url: String) {
mContext.startActivity(Intent(Intent.ACTION_VIEW).setData(Uri.parse(url))) mContext.startActivity(Intent(Intent.ACTION_VIEW).setData(Uri.parse(url)))
} }
/** /**
* Compute Levenshtein Distance. Doesn't really matter which string goes first * Compute Levenshtein Distance. Doesn't really matter which string goes first
* *
* @param stringB Second string * @param stringToCompare Second string
* @return The amount of changes that are needed to transform one string into another * @return The amount of changes that are needed to transform one string into another
*/ */
fun CharSequence.lev(stringB: String): Int { fun String.lev(stringToCompare: String): Int {
// Skipping computation for this cases val stringA = this.lowercase()
if (this == stringB) return 0 val stringB = stringToCompare.lowercase()
if (this.isEmpty()) return stringB.length
// This case is basically unreal in this app, because stringB is a unit name and are never empty
if (stringB.isEmpty()) return this.length
var cost = IntArray(this.length + 1) { it } // Skipping computation for this cases
var newCost = IntArray(this.length + 1) if (stringA == stringB) return 0
if (stringA.isEmpty()) return stringB.length
// This case is basically unreal in this app, because stringToCompare is a unit name and they are never empty
if (stringB.isEmpty()) return stringA.length
var cost = IntArray(stringA.length + 1) { it }
var newCost = IntArray(stringA.length + 1)
for (i in 1..stringB.length) { for (i in 1..stringB.length) {
// basically shifting this to the right by 1 each time // basically shifting this to the right by 1 each time
newCost[0] = i newCost[0] = i
for (j in 1..this.length) { for (j in 1..stringA.length) {
newCost[j] = minOf( newCost[j] = minOf(
// Adding 1 if they don't match, i.e. need to replace // Adding 1 if they don't match, i.e. need to replace
cost[j - 1] + if (this[j - 1] == stringB[i - 1]) 0 else 1, cost[j - 1] + if (stringA[j - 1] == stringB[i - 1]) 0 else 1,
// Insert // Insert
cost[j] + 1, cost[j] + 1,
// Delete // Delete
@ -140,3 +143,58 @@ fun CharSequence.lev(stringB: String): Int {
return cost[this.length] return cost[this.length]
} }
/**
 * Filters units by how closely their rendered name matches a search query and orders the
 * survivors from the closest match to the most distant one.
 *
 * The receiver sequence is fully consumed when this function is called; the returned sequence
 * is a view over an already computed list.
 *
 * @param stringA Search query the unit names are compared against
 * @return Units that passed the filter. Units with a lower distance come first; units with an
 * equal distance keep the order they had in the receiver
 */
fun Sequence<AbstractUnit>.sortByLev(stringA: String): Sequence<AbstractUnit> {
    // Integer division: a unit is kept only when fewer than half of the query symbols are wrong
    val maxDistance = stringA.length / 2

    val scoredUnits: List<Pair<AbstractUnit, Int>> = this.mapNotNull { unit ->
        val unitName = unit.renderedName
        if (unitName.contains(stringA)) {
            // The name contains the query as is (this check is case sensitive), so there is no
            // need to compute the distance. Such a unit gets a fixed distance of 1 instead of 0:
            // it is not guaranteed to be the one the user wants, so it has to compete with the
            // other candidates instead of always being pinned to the top.
            unit to 1
        } else {
            // Only the beginning of the name (at most as many symbols as the query has) is
            // compared. Comparing the whole name would punish units whose name is longer than
            // the query: for the query 'Kelometer' the full name 'Kilometer per hour' is far
            // beyond the threshold, while its prefix 'Kilometer' is a single edit away.
            val comparedLength = minOf(stringA.length, unitName.length)
            val distance = unitName.substring(0, comparedLength).lev(stringA)
            // Strict comparison: a distance equal to the threshold is already too far
            if (distance < maxDistance) unit to distance else null
        }
    }.toList()

    // Closest matches first
    return scoredUnits
        .sortedBy { (_, distance) -> distance }
        .map { (unit, _) -> unit }
        .asSequence()
}

View File

@ -0,0 +1,69 @@
package com.sadellie.unitto.screens
import com.sadellie.unitto.data.units.AbstractUnit
import com.sadellie.unitto.data.units.MyUnit
import com.sadellie.unitto.data.units.UnitGroup
import org.junit.Assert.assertEquals
import org.junit.Test
import java.math.BigDecimal
/**
 * Units shared by the tests in this file.
 *
 * Every unit is built with the same constructor arguments, only [AbstractUnit.renderedName]
 * differs between them.
 */
val baseList: List<AbstractUnit> = listOf(
    "Attometer",
    "Nanometer",
    "Millimeter",
    "Meter",
    "Kilometer",
    "Mile",
    "Pound",
    "Kilometer per square"
).map { unitName ->
    val unit = MyUnit("", BigDecimal.ONE, UnitGroup.ANGLE, 0, 0)
    unit.renderedName = unitName
    unit
}
/**
 * Checks which units from [baseList] survive [sortByLev] for a given query and in which order
 * they are returned.
 */
class LevenshteinFilterAndSortTest {

    /**
     * Runs [query] through [sortByLev] over [baseList].
     *
     * @return Rendered names of the units that were kept, in the order they were returned
     */
    private fun search(query: String): List<String> =
        baseList.asSequence().sortByLev(query).map { it.renderedName }.toList()

    /** Query with a typo: the closest names come first, more distant ones follow. */
    @Test
    fun testOneEdit() {
        assertEquals(
            listOf("Kilometer", "Kilometer per square", "Attometer", "Nanometer"),
            search("Kelometer")
        )
    }

    /** Query that is longer than some unit names. */
    @Test
    fun testLongQuery() {
        assertEquals(
            listOf("Kilometer per square", "Kilometer"),
            search("Kelometers per")
        )
    }

    /** Query that is a part of several unit names. */
    @Test
    fun testMultipleMatches() {
        assertEquals(
            listOf("Meter", "Attometer", "Nanometer", "Millimeter", "Kilometer", "Kilometer per square"),
            search("meter")
        )
    }

    /** Query that is not close to any unit name: nothing is returned. */
    @Test
    fun testNone() {
        assertEquals(
            emptyList<String>(),
            search("Very long unit name that doesn't exist")
        )
    }
}

View File

@ -40,4 +40,9 @@ class LevenshteinTest {
fun levEmptyB() { fun levEmptyB() {
assertEquals(9, "red truck".lev("")) assertEquals(9, "red truck".lev(""))
} }
/** Strings that differ only in letter case are expected to have a distance of zero. */
@Test
fun levDifferentCases() {
    val lowerCased = "red truck"
    val mixedCased = "red TRUCK"
    assertEquals(0, lowerCased.lev(mixedCased))
}
} }