Refactor Tokenizer

This commit is contained in:
Sad Ellie 2024-02-06 18:07:54 +03:00
parent 32eb7422d5
commit 67a4852741
3 changed files with 216 additions and 223 deletions

View File

@ -35,7 +35,7 @@ class Expression(
private val radianMode: Boolean = true, private val radianMode: Boolean = true,
private val roundingMode: RoundingMode = RoundingMode.HALF_EVEN private val roundingMode: RoundingMode = RoundingMode.HALF_EVEN
) { ) {
private val tokens = Tokenizer(input).tokenize() private val tokens = input.tokenize()
private var cursorPosition = 0 private var cursorPosition = 0
/** /**

View File

@ -26,188 +26,169 @@ sealed class TokenizerException(message: String) : Exception(message) {
class BadScientificNotation : TokenizerException("Expected plus or minus symbol after \"E\"") class BadScientificNotation : TokenizerException("Expected plus or minus symbol after \"E\"")
} }
class Tokenizer(private val streamOfTokens: String) { fun String.tokenize(): List<String> {
// Don't create object at all? var cursor = 0
fun tokenize(): List<String> { val tokens: MutableList<String> = mutableListOf()
var cursor = 0
val tokens: MutableList<String> = mutableListOf()
while (cursor != streamOfTokens.length) { while (cursor != this.length) {
val nextToken = peekTokenAfter(cursor) val nextToken = peekTokenAfter(this, cursor)
if (nextToken != null) { if (nextToken != null) {
tokens.add(nextToken) tokens.add(nextToken)
cursor += nextToken.length cursor += nextToken.length
} else { } else {
// Didn't find any token, move right slowly (by 1 symbol) // Didn't find any token, move right slowly (by 1 symbol)
cursor++ cursor++
}
} }
return tokens.repairLexicon()
} }
private fun peekTokenAfter(cursor: Int): String? { return tokens.repairLexicon()
Token.expressionTokens.forEach { token -> }
val subs = streamOfTokens
.substring(
cursor,
(cursor + token.length).coerceAtMost(streamOfTokens.length)
)
if (subs == token) {
// Got a digit, see if there are other digits coming after
if (token in Token.Digit.allWithDot) {
val number = streamOfTokens
.substring(cursor)
.takeWhile { Token.Digit.allWithDot.contains(it.toString()) }
if (number.count { it.toString() == Token.Digit.dot } > 1) { private fun peekTokenAfter(
throw TokenizerException.TooManyFractionSymbols() streamOfTokens: String,
} cursor: Int
): String? {
Token.expressionTokens.forEach { token ->
val subs = streamOfTokens
.substring(
cursor,
(cursor + token.length).coerceAtMost(streamOfTokens.length)
)
if (subs == token) {
// Got a digit, see if there are other digits coming after
if (token in Token.Digit.allWithDot) {
val number = streamOfTokens
.substring(cursor)
.takeWhile { Token.Digit.allWithDot.contains(it.toString()) }
return number if (number.count { it.toString() == Token.Digit.dot } > 1) {
} throw TokenizerException.TooManyFractionSymbols()
return token
}
}
return null
}
/**
 * Runs all repair passes over the token list, in a fixed order: closes unclosed
 * brackets, unpacks the "E" notation, inserts missing multiplications, unpacks
 * percents and finally inserts missing multiplications once more.
 *
 * @return repaired list of tokens.
 */
private fun List<String>.repairLexicon(): List<String> {
    val balanced = this.missingClosingBrackets()
    val withoutNotation = balanced.unpackNotation()
    val withMultiply = withoutNotation.missingMultiply()
    val withoutPercents = withMultiply.unpackAllPercents()

    // Input like 80%80% should be treated as 80%*80%.
    // After unpacking we get (80/100)(80/100), so the multiply is still missing (!!!)
    // No, we can't unpack before fixing missing multiply.
    // Ideally we need to add the missing multiply for 80%80% first, so that
    // unpackAllPercents gets input with all operators in place (80%*80% in this case).
    // Can't be done right now since missingMultiply checks for tokens in front only,
    // hence the second missingMultiply pass below.
    return withoutPercents.missingMultiply()
}
/**
 * Adds closing brackets at the end of the list so that every opening bracket
 * gets a pair.
 *
 * @return this very list when there is nothing to add (brackets are balanced or
 * there are more closing brackets than opening ones), otherwise a new list with
 * the missing closing brackets appended.
 */
private fun List<String>.missingClosingBrackets(): List<String> {
    val leftBrackets = this.count { it == Token.Operator.leftBracket }
    val rightBrackets = this.count { it == Token.Operator.rightBracket }
    val neededBrackets = leftBrackets - rightBrackets
    if (neededBrackets <= 0) return this

    // Build the tail once and concatenate, instead of copying the whole list
    // again for every single added bracket
    return this + List(neededBrackets) { Token.Operator.rightBracket }
}
/**
 * Inserts a multiply operator wherever it is implied but not written.
 *
 * A multiply is added after a number, a constant or a closing bracket when the
 * token right after it is an opening bracket, a function, a constant, a square
 * root or a number.
 *
 * @return new list with the missing multiply operators added.
 */
private fun List<String>.missingMultiply(): List<String> {
    val repaired = mutableListOf<String>()

    for ((position, current) in this.withIndex()) {
        repaired.add(current)

        // This will not insert multiply between digits because they are grouped into a
        // single token. It's not possible to get separate digit tokens near each other
        // Things like ["123", "456"] are impossible, will be ["123456"]
        val canBeFollowedByMultiply = current.isDigitToken() ||
            current in Token.Const.all ||
            current == Token.Operator.rightBracket
        if (!canBeFollowedByMultiply) continue

        // Last token, nothing comes after it
        val following = this.tokenInFront(position) ?: continue

        val needsMultiply = following == Token.Operator.leftBracket ||
            following in Token.Func.all ||
            following in Token.Const.all ||
            following == Token.Operator.sqrt ||
            following.isDigitToken()
        if (needsMultiply) repaired.add(Token.Operator.multiply)
    }

    return repaired
}
/**
 * Unpacks percent tokens one by one, always starting from the first one found,
 * until there are none left in the list.
 *
 * @return list without percent tokens.
 */
private fun List<String>.unpackAllPercents(): List<String> {
    var unpacked = this
    var percentIndex = unpacked.indexOf(Token.Operator.percent)

    // indexOf gives -1 once every percent is unpacked
    while (percentIndex >= 0) {
        unpacked = unpacked.unpackPercentAt(percentIndex)
        percentIndex = unpacked.indexOf(Token.Operator.percent)
    }

    return unpacked
}
private fun List<String>.unpackNotation(): List<String> {
// Transform 1E+7 ==> 1*10^7
// Transform 1E-7 ==> 1/10^7
val result = this.toMutableList()
val listIterator = result.listIterator()
while (listIterator.hasNext()) {
if (listIterator.next() == Token.DisplayOnly.engineeringE) {
listIterator.remove()
val tokenAfterE = try {
listIterator.next()
} catch (e: Exception) {
throw TokenizerException.BadScientificNotation()
} }
listIterator.remove() return number
}
return token
}
}
return null
}
when (tokenAfterE) { private fun MutableList<String>.repairLexicon(): List<String> {
Token.Operator.minus -> listIterator.add(Token.Operator.divide) return this
Token.Operator.plus -> listIterator.add(Token.Operator.multiply) .missingClosingBrackets()
else -> throw TokenizerException.BadScientificNotation() .unpackNotation()
} .missingMultiply()
.unpackAllPercents()
// input like 80%80% should be treated as 80%*80%.
// After unpacking we get (80/100)(80/100), the multiply is missing (!!!)
// No, we can't unpack before fixing missing multiply.
// Ideally we need to add missing multiply for 80%80%
// In that case unpackAllPercents gets input with all operators 80%*80% in this case
// Can't be done right now since missingMultiply checks for tokens in front only
.missingMultiply()
}
listIterator.add("10") private fun MutableList<String>.missingClosingBrackets(): MutableList<String> {
listIterator.add(Token.Operator.power) val leftBracket = this.count { it == Token.Operator.leftBracket }
val rightBrackets = this.count { it == Token.Operator.rightBracket }
val neededBrackets = leftBracket - rightBrackets
if (neededBrackets <= 0) return this
repeat(neededBrackets) {
this.add(Token.Operator.rightBracket)
}
return this
}
private fun MutableList<String>.missingMultiply(): MutableList<String> {
val iterator = this.listIterator()
while (iterator.hasNext()) {
val currentToken = iterator.next()
// Need two tokens for the checks
if (!iterator.hasNext()) break
val isDigit = currentToken.isDigitToken()
val isConst = currentToken in Token.Const.all
val isRightBracket = currentToken == Token.Operator.rightBracket
// may need a multiplication after
if (isDigit || isConst || isRightBracket) {
// Peek next, but then go back
val tokenAfter = iterator.next()
iterator.previous()
if (tokenAfter == Token.Operator.leftBracket ||
tokenAfter in Token.Func.all ||
tokenAfter in Token.Const.all ||
tokenAfter == Token.Operator.sqrt ||
tokenAfter.isDigitToken()) {
iterator.add(Token.Operator.multiply)
} }
} }
return result
} }
private fun List<String>.unpackPercentAt(percentIndex: Int): List<String> { return this
var cursor = percentIndex }
// get whatever is the percentage private fun MutableList<String>.unpackNotation(): MutableList<String> {
val percentage = this.getNumberOrExpressionBefore(percentIndex) // Transform 1E+7 ==> 1*10^7
// Move cursor // Transform 1E-7 ==> 1/10^7
cursor -= percentage.size val iterator = this.listIterator()
// get the operator in front while (iterator.hasNext()) {
cursor -= 1 if (iterator.next() == Token.DisplayOnly.engineeringE) {
val operator = this.getOrNull(cursor) iterator.remove()
// Don't go further val tokenAfterE = try {
if ((operator == null) or (operator !in listOf(Token.Operator.plus, Token.Operator.minus))) { iterator.next()
val mutList = this.toMutableList() } catch (e: Exception) {
throw TokenizerException.BadScientificNotation()
}
// Remove percentage iterator.remove()
mutList.removeAt(percentIndex)
//Add opening bracket before percentage when (tokenAfterE) {
mutList.add(percentIndex - percentage.size, Token.Operator.leftBracket) Token.Operator.minus -> iterator.add(Token.Operator.divide)
Token.Operator.plus -> iterator.add(Token.Operator.multiply)
else -> throw TokenizerException.BadScientificNotation()
}
// Add "/ 100" and closing bracket iterator.add("10")
mutList.addAll(percentIndex + 1, listOf(Token.Operator.divide, "100", Token.Operator.rightBracket)) iterator.add(Token.Operator.power)
return mutList
} }
// Get the base }
val base = this.getBaseBefore(cursor)
return this
}
/**
 * Keeps unpacking the first percent token found until the list has no percent
 * tokens left.
 *
 * @return list without percent tokens.
 */
private fun MutableList<String>.unpackAllPercents(): MutableList<String> {
    var unpacked = this
    var percentIndex = unpacked.indexOf(Token.Operator.percent)

    // Negative index means there is nothing left to unpack
    while (percentIndex >= 0) {
        unpacked = unpacked.unpackPercentAt(percentIndex)
        percentIndex = unpacked.indexOf(Token.Operator.percent)
    }

    return unpacked
}
private fun MutableList<String>.unpackPercentAt(percentIndex: Int): MutableList<String> {
var cursor = percentIndex
// get whatever is the percentage
val percentage = this.getNumberOrExpressionBefore(percentIndex)
// Move cursor
cursor -= percentage.size
// get the operator in front
cursor -= 1
val operator = this.getOrNull(cursor)
// Don't go further
if ((operator == null) or (operator !in listOf(Token.Operator.plus, Token.Operator.minus))) {
val mutList = this.toMutableList() val mutList = this.toMutableList()
// Remove percentage // Remove percentage
@ -216,71 +197,83 @@ class Tokenizer(private val streamOfTokens: String) {
//Add opening bracket before percentage //Add opening bracket before percentage
mutList.add(percentIndex - percentage.size, Token.Operator.leftBracket) mutList.add(percentIndex - percentage.size, Token.Operator.leftBracket)
// Add "/ 100" and other stuff // Add "/ 100" and closing bracket
mutList.addAll( mutList.addAll(percentIndex + 1, listOf(Token.Operator.divide, "100", Token.Operator.rightBracket))
percentIndex + 1,
listOf(
Token.Operator.divide,
"100",
Token.Operator.multiply,
Token.Operator.leftBracket,
*base.toTypedArray(),
Token.Operator.rightBracket,
Token.Operator.rightBracket
)
)
return mutList return mutList
} }
// Get the base
val base = this.getBaseBefore(cursor)
val mutList = this.toMutableList()
private fun List<String>.getNumberOrExpressionBefore(pos: Int): List<String> { // Remove percentage
val digits = Token.Digit.allWithDot.map { it[0] } mutList.removeAt(percentIndex)
val tokenInFront = this[pos - 1] //Add opening bracket before percentage
mutList.add(percentIndex - percentage.size, Token.Operator.leftBracket)
// Just number // Add "/ 100" and other stuff
if (tokenInFront.all { it in digits }) return listOf(tokenInFront) mutList.addAll(
percentIndex + 1,
listOf(
Token.Operator.divide,
"100",
Token.Operator.multiply,
Token.Operator.leftBracket,
*base.toTypedArray(),
Token.Operator.rightBracket,
Token.Operator.rightBracket
)
)
// For cases like "100+(2+5)|%". The check above won't pass, so the next expected thing is return mutList
// a number in brackets. Anything else is not expected.
if (tokenInFront != Token.Operator.rightBracket) throw TokenizerException.FailedToUnpackNumber()
// Start walking left until we get balanced brackets
var cursor = pos - 1
var leftBrackets = 0
var rightBrackets = 1 // We set 1 because we start with closing bracket
while (leftBrackets != rightBrackets) {
cursor--
val currentToken = this[cursor]
if (currentToken == Token.Operator.leftBracket) leftBrackets++
if (currentToken == Token.Operator.rightBracket) rightBrackets++
}
return this.subList(cursor, pos)
}
private fun List<String>.getBaseBefore(pos: Int): List<String> {
var cursor = pos
var leftBrackets = 0
var rightBrackets = 0
while ((--cursor >= 0)) {
val currentToken = this[cursor]
if (currentToken == Token.Operator.leftBracket) leftBrackets++
if (currentToken == Token.Operator.rightBracket) rightBrackets++
if (leftBrackets > rightBrackets) break
}
// Return cursor back to last token
cursor += 1
return this.subList(cursor, pos)
}
private fun String.isDigitToken(): Boolean = first().toString() in Token.Digit.allWithDot
private fun List<String>.tokenInFront(index: Int): String? = getOrNull(index + 1)
} }
/**
 * Finds what stands right before the token at [pos]: either a single number token
 * or a whole expression in brackets.
 *
 * @param pos index of the token (percent) to look behind of.
 * @return list with the single number token, or a view ([subList]) of this list
 * covering the bracketed expression, brackets included.
 * @throws TokenizerException.FailedToUnpackNumber when there is no token before
 * [pos], when that token is neither a number nor a closing bracket, or when the
 * closing bracket has no matching opening bracket.
 */
private fun MutableList<String>.getNumberOrExpressionBefore(pos: Int): List<String> {
    val digits = Token.Digit.allWithDot.map { it[0] }

    // Nothing in front (for example, input starts with a percent)
    val tokenInFront = this.getOrNull(pos - 1)
        ?: throw TokenizerException.FailedToUnpackNumber()

    // Just number
    if (tokenInFront.all { it in digits }) return listOf(tokenInFront)

    // For cases like "100+(2+5)|%". The check above won't pass, so the next expected thing is
    // a number in brackets. Anything else is not expected.
    if (tokenInFront != Token.Operator.rightBracket) throw TokenizerException.FailedToUnpackNumber()

    // Start walking left until we get balanced brackets
    var cursor = pos - 1
    var leftBrackets = 0
    var rightBrackets = 1 // We set 1 because we start with closing bracket
    while (leftBrackets != rightBrackets) {
        cursor--
        // Reached the start of the list, but brackets are still not balanced: "2)%"
        if (cursor < 0) throw TokenizerException.FailedToUnpackNumber()

        val currentToken = this[cursor]
        if (currentToken == Token.Operator.leftBracket) leftBrackets++
        if (currentToken == Token.Operator.rightBracket) rightBrackets++
    }

    return this.subList(cursor, pos)
}
/**
 * Collects the tokens that stand before [pos], walking left until the start of
 * the list or until an opening bracket that was not closed on the way.
 *
 * @param pos index to look behind of (exclusive).
 * @return view ([subList]) of this list from the found start up to [pos]. The
 * unmatched opening bracket itself is not included.
 */
private fun List<String>.getBaseBefore(pos: Int): List<String> {
    // Opening brackets minus closing brackets met so far
    var nesting = 0
    var start = pos - 1

    while (start >= 0) {
        val token = this[start]
        if (token == Token.Operator.leftBracket) nesting += 1
        if (token == Token.Operator.rightBracket) nesting -= 1

        // This opening bracket has no pair on the way, base starts right after it
        if (nesting > 0) break
        start -= 1
    }

    // start points at the token before the base (or -1), step back to the base
    return this.subList(start + 1, pos)
}
/**
 * Checks whether this token is a number by looking at its first symbol only.
 *
 * @return true if the first symbol is one of [Token.Digit.allWithDot].
 */
private fun String.isDigitToken(): Boolean {
    val leadingSymbol = this.first().toString()
    return Token.Digit.allWithDot.contains(leadingSymbol)
}

View File

@ -40,7 +40,7 @@ fun <T : Throwable?> assertExprFail(
} }
fun assertLex(expected: List<String>, actual: String) = fun assertLex(expected: List<String>, actual: String) =
assertEquals(expected, Tokenizer(actual).tokenize()) assertEquals(expected, actual.tokenize())
fun assertLex(expected: String, actual: String) = fun assertLex(expected: String, actual: String) =
assertEquals(expected, Tokenizer(actual).tokenize().joinToString("")) assertEquals(expected, actual.tokenize().joinToString(""))