ByteArray64Writer to handle the UTF-8 correctly

This commit is contained in:
minjaesong
2021-09-01 11:43:01 +09:00
parent 3b37e91e4a
commit 07f26a7716
3 changed files with 130 additions and 95 deletions

View File

@@ -4,6 +4,8 @@ import net.torvald.terrarum.console.ConsoleCommand
import net.torvald.terrarum.console.Echo import net.torvald.terrarum.console.Echo
import net.torvald.terrarum.serialise.ByteArray64Reader import net.torvald.terrarum.serialise.ByteArray64Reader
import net.torvald.terrarum.serialise.ByteArray64Writer import net.torvald.terrarum.serialise.ByteArray64Writer
import net.torvald.terrarum.serialise.Common
import net.torvald.terrarum.serialise.toUint
import java.io.File import java.io.File
/** /**
@@ -30,4 +32,19 @@ object ReaderTest : ConsoleCommand {
override fun printUsage() { override fun printUsage() {
Echo("Usage: readertest") Echo("Usage: readertest")
} }
}
/**
 * Console command that feeds one supplementary-plane character through [ByteArray64Writer],
 * one UTF-16 code unit at a time, and prints the bytes the writer produced as
 * space-separated, two-digit upper-case hexadecimal.
 */
object WriterTest : ConsoleCommand {
    override fun execute(args: Array<String>) {
        // U+1F3DE spelled as its UTF-16 surrogate pair
        val sample = "\ud83c\udfde"
        val writer = ByteArray64Writer(Common.CHARSET)

        // push each Char individually so the writer has to stitch the pair back together
        for (unit in sample) {
            writer.write(unit.toInt())
        }
        writer.close()

        // every byte is followed by a single space, then the line is terminated
        val hexDump = StringBuilder()
        writer.toByteArray64().forEach { byte ->
            hexDump.append(byte.toUint().toString(16).uppercase().padStart(2, '0'))
            hexDump.append(" ")
        }
        println(hexDump)
    }

    override fun printUsage() {
        Echo("Usage: writertest")
    }
}

View File

@@ -15,6 +15,7 @@ import net.torvald.terrarum.tail
import net.torvald.terrarum.utils.* import net.torvald.terrarum.utils.*
import org.apache.commons.codec.digest.DigestUtils import org.apache.commons.codec.digest.DigestUtils
import java.io.Reader import java.io.Reader
import java.io.StringReader
import java.io.Writer import java.io.Writer
import java.math.BigInteger import java.math.BigInteger
import java.nio.channels.ClosedChannelException import java.nio.channels.ClosedChannelException
@@ -182,12 +183,41 @@ object Common {
* @return Bytes in [b] which are GZip'd then Ascii85-encoded * @return Bytes in [b] which are GZip'd then Ascii85-encoded
*/ */
private fun blockLayerToStr(b: BlockLayer): String { private fun blockLayerToStr(b: BlockLayer): String {
return bytesToZipdStr(b.bytesIterator())
}
/**
 * Rebuilds a [BlockLayer] from its serialised form.
 *
 * The text in `layerInfo.b` is decoded with [strToBytes]; the resulting bytes are copied into
 * a new layer of size `layerInfo.x` by `layerInfo.y` and fed to the digester at the same time.
 * Bytes that would not fit into the layer's storage are ignored.
 *
 * @param layerInfo dimensions, encoded payload and expected hash of the layer
 * @return the reconstructed layer
 * @throws BlockLayerHashMismatchError when the hash of the copied bytes differs from `layerInfo.h`
 */
private fun strToBlockLayer(layerInfo: LayerInfo): BlockLayer {
    val restored = BlockLayer(layerInfo.x, layerInfo.y)
    val payload = strToBytes(StringReader(layerInfo.b))

    // copy into the layer while hashing exactly the bytes that were copied
    digester.reset()
    var cursor = 0L
    payload.forEach { byte ->
        if (cursor >= restored.ptr.size) return@forEach
        restored.ptr[cursor] = byte
        digester.update(byte)
        cursor += 1
    }

    // compare against the hash stored alongside the payload
    val actualHash = buildString {
        digester.digest().forEach { append(it.tostr()) }
    }
    if (actualHash != layerInfo.h) {
        throw BlockLayerHashMismatchError(layerInfo.h, actualHash, restored)
    }

    return restored
}
fun bytesToZipdStr(byteIterator: Iterator<Byte>): String {
val sb = StringBuilder() val sb = StringBuilder()
val bo = ByteArray64GrowableOutputStream() val bo = ByteArray64GrowableOutputStream()
val zo = GZIPOutputStream(bo) val zo = GZIPOutputStream(bo)
// zip // zip
b.bytesIterator().forEach { byteIterator.forEach {
zo.write(it.toInt()) zo.write(it.toInt())
} }
zo.flush(); zo.close() zo.flush(); zo.close()
@@ -210,21 +240,22 @@ object Common {
return sb.toString() return sb.toString()
} }
private fun strToBlockLayer(layerInfo: LayerInfo): BlockLayer { fun strToBytes(reader: Reader): ByteArray64 {
val layer = BlockLayer(layerInfo.x, layerInfo.y)
val unasciidBytes = ByteArray64() val unasciidBytes = ByteArray64()
val unzipdBytes = ByteArray64() val unzipdBytes = ByteArray64()
// unascii // unascii
var bai = 0 var bai = 0
val buf = CharArray(5) { Ascii85.PAD_CHAR } val buf = CharArray(5) { Ascii85.PAD_CHAR }
layerInfo.b.forEach { while (true) {
val char = reader.read()
if (char < 0) break
if (bai > 0 && bai % 5 == 0) { if (bai > 0 && bai % 5 == 0) {
Ascii85.decode(buf[0], buf[1], buf[2], buf[3], buf[4]).forEach { unasciidBytes.add(it) } Ascii85.decode(buf[0], buf[1], buf[2], buf[3], buf[4]).forEach { unasciidBytes.add(it) }
buf.fill(Ascii85.PAD_CHAR) buf.fill(Ascii85.PAD_CHAR)
} }
buf[bai % 5] = it buf[bai % 5] = char.toChar()
bai += 1 bai += 1
}; Ascii85.decode(buf[0], buf[1], buf[2], buf[3], buf[4]).forEach { unasciidBytes.add(it) } }; Ascii85.decode(buf[0], buf[1], buf[2], buf[3], buf[4]).forEach { unasciidBytes.add(it) }
@@ -238,43 +269,22 @@ object Common {
} }
zi.close() zi.close()
// write to blocklayer and the digester return unzipdBytes
digester.reset()
var writeCursor = 0L
val sb = StringBuilder()
unzipdBytes.forEach {
if (writeCursor < layer.ptr.size) {
if (writeCursor < 1024) {
sb.append("${it.tostr()} ")
}
layer.ptr[writeCursor] = it
digester.update(it)
writeCursor += 1
}
}
// printdbg(this, "post: $sb")
// check hash
val hash = StringBuilder().let { sb -> digester.digest().forEach { sb.append(it.tostr()) }; sb.toString() }
if (hash != layerInfo.h) {
throw BlockLayerHashMismatchError(layerInfo.h, hash, layer)
}
return layer
} }
} }
class ByteArray64Writer(val charset: Charset) : Writer() { class ByteArray64Writer(val charset: Charset) : Writer() {
private var closed = false private val acceptableCharsets = arrayOf(Charsets.UTF_8, Charset.forName("CP437"))
init {
if (!acceptableCharsets.contains(charset))
throw UnsupportedCharsetException(charset.name())
}
private val ba64 = ByteArray64() private val ba64 = ByteArray64()
private var closed = false
private var surrogateBuf = 0
init { init {
this.lock = ba64 this.lock = ba64
@@ -284,9 +294,54 @@ class ByteArray64Writer(val charset: Charset) : Writer() {
if (closed) throw ClosedChannelException() if (closed) throw ClosedChannelException()
} }
/** True when this value is a UTF-16 high (leading) surrogate code unit, i.e. lies in D800..DBFF. */
private fun Int.isSurroHigh() = this in 0xD800..0xDBFF
/** True when this value is a UTF-16 low (trailing) surrogate code unit, i.e. lies in DC00..DFFF. */
private fun Int.isSurroLow() = this in 0xDC00..0xDFFF
/** Formats this value for error messages: the letter `u` followed by at least four upper-case hex digits. */
private fun Int.toUcode() = "u${this.toString(16).uppercase().padStart(4, '0')}"
/**
 * Writes one character, given as a single UTF-16 code unit.
 *
 * For UTF-8 a high surrogate is held back until the matching low surrogate arrives; the pair is
 * then combined into one code point and encoded. For CP437 the low eight bits of [c] are stored
 * as-is.
 *
 * @param c a Java Char cast to Int (a UTF-16 code unit), NOT a Unicode code point
 * @throws ClosedChannelException if the writer has been closed (raised by `checkOpen`)
 * @throws IllegalStateException on an unpaired or out-of-order surrogate
 * @throws UnsupportedCharsetException if the charset is neither UTF-8 nor CP437
 */
override fun write(c: Int) {
    checkOpen()
    when (charset) {
        Charsets.UTF_8 -> when {
            // ordinary BMP character, nothing pending
            surrogateBuf == 0 && !c.isSurroHigh() && !c.isSurroLow() ->
                writeUtf8Codepoint(c)

            // first half of a pair: remember it and wait for the second half
            surrogateBuf == 0 && c.isSurroHigh() ->
                surrogateBuf = c

            // second half of a pair: combine, then forget the pending high surrogate.
            // Without the reset every later character would be rejected as an
            // invalid pair, because the buffer would still look occupied.
            surrogateBuf != 0 && c.isSurroLow() -> {
                val high = surrogateBuf
                surrogateBuf = 0
                writeUtf8Codepoint(0x10000 + (high.and(1023).shl(10) or c.and(1023)))
            }

            // invalid surrogate pair input
            else ->
                throw IllegalStateException("Surrogate high: ${surrogateBuf.toUcode()}, surrogate low: ${c.toUcode()}")
        }
        Charset.forName("CP437") -> {
            ba64.add(c.toByte())
        }
        else -> throw UnsupportedCharsetException(charset.name())
    }
}
/**
 * Appends the UTF-8 byte sequence (one to four bytes) for [codepoint] to the backing array.
 *
 * @param codepoint a value in 0..0x10FFFF
 * @throws IllegalArgumentException when [codepoint] lies outside that range
 */
fun writeUtf8Codepoint(codepoint: Int) {
    if (codepoint < 0 || codepoint > 0x10FFFF)
        throw IllegalArgumentException("Not a unicode code point: U+${codepoint.toString(16).uppercase()}")

    // lead byte: length marker combined with the topmost payload bits
    fun lead(marker: Int, shift: Int, mask: Int) {
        ba64.add((marker or ((codepoint ushr shift) and mask)).toByte())
    }
    // continuation byte: 10xxxxxx carrying six payload bits
    fun tail(shift: Int) {
        ba64.add((0x80 or ((codepoint ushr shift) and 0x3F)).toByte())
    }

    when {
        codepoint < 0x80 -> ba64.add(codepoint.toByte())
        codepoint < 0x800 -> {
            lead(0xC0, 6, 0x1F)
            tail(0)
        }
        codepoint < 0x10000 -> {
            lead(0xE0, 12, 0x0F)
            tail(6)
            tail(0)
        }
        else -> {
            lead(0xF0, 18, 0x07)
            tail(12)
            tail(6)
            tail(0)
        }
    }
}
override fun write(cbuf: CharArray) { override fun write(cbuf: CharArray) {

View File

@@ -3,12 +3,16 @@ package net.torvald.terrarum.serialise
import com.badlogic.gdx.utils.compression.Lzma import com.badlogic.gdx.utils.compression.Lzma
import net.torvald.terrarum.ModMgr import net.torvald.terrarum.ModMgr
import net.torvald.terrarum.gameactors.Actor import net.torvald.terrarum.gameactors.Actor
import net.torvald.terrarum.gameworld.BlockLayer
import net.torvald.terrarum.modulebasegame.TerrarumIngame import net.torvald.terrarum.modulebasegame.TerrarumIngame
import net.torvald.terrarum.modulebasegame.worldgenerator.RoguelikeRandomiser import net.torvald.terrarum.modulebasegame.worldgenerator.RoguelikeRandomiser
import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.ByteArray64 import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.ByteArray64
import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.ByteArray64GrowableOutputStream import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.ByteArray64GrowableOutputStream
import net.torvald.terrarum.modulecomputers.virtualcomputer.tvd.ByteArray64InputStream
import net.torvald.terrarum.weather.WeatherMixer import net.torvald.terrarum.weather.WeatherMixer
import java.io.ByteArrayInputStream import java.io.ByteArrayInputStream
import java.io.StringReader
import java.util.zip.GZIPInputStream
import java.util.zip.GZIPOutputStream import java.util.zip.GZIPOutputStream
/** /**
@@ -36,28 +40,28 @@ open class WriteMeta(val ingame: TerrarumIngame) {
it.append("\n\n## module: $modname ##\n\n") it.append("\n\n## module: $modname ##\n\n")
it.append(file.readText()) it.append(file.readText())
} }
bytesToZipdStr(it.toString().toByteArray(Common.CHARSET)) zipStrAndEnascii(it.toString())
}}", }}",
"items": "${StringBuilder().let { "items": "${StringBuilder().let {
ModMgr.getFilesFromEveryMod("items/itemid.csv").forEach { (modname, file) -> ModMgr.getFilesFromEveryMod("items/itemid.csv").forEach { (modname, file) ->
it.append("\n\n## module: $modname ##\n\n") it.append("\n\n## module: $modname ##\n\n")
it.append(file.readText()) it.append(file.readText())
} }
bytesToZipdStr(it.toString().toByteArray(Common.CHARSET)) zipStrAndEnascii(it.toString())
}}", }}",
"wires": "${StringBuilder().let { "wires": "${StringBuilder().let {
ModMgr.getFilesFromEveryMod("wires/wires.csv").forEach { (modname, file) -> ModMgr.getFilesFromEveryMod("wires/wires.csv").forEach { (modname, file) ->
it.append("\n\n## module: $modname ##\n\n") it.append("\n\n## module: $modname ##\n\n")
it.append(file.readText()) it.append(file.readText())
} }
bytesToZipdStr(it.toString().toByteArray(Common.CHARSET)) zipStrAndEnascii(it.toString())
}}", }}",
"materials": "${StringBuilder().let { "materials": "${StringBuilder().let {
ModMgr.getFilesFromEveryMod("materials/materials.csv").forEach { (modname, file) -> ModMgr.getFilesFromEveryMod("materials/materials.csv").forEach { (modname, file) ->
it.append("\n\n## module: $modname ##\n\n") it.append("\n\n## module: $modname ##\n\n")
it.append(file.readText()) it.append(file.readText())
} }
bytesToZipdStr(it.toString().toByteArray(Common.CHARSET)) zipStrAndEnascii(it.toString())
}}", }}",
"loadorder": [${ModMgr.loadOrder.map { "\"${it}\"" }.joinToString()}], "loadorder": [${ModMgr.loadOrder.map { "\"${it}\"" }.joinToString()}],
"worlds": [${ingame.gameworldIndices.joinToString()}] "worlds": [${ingame.gameworldIndices.joinToString()}]
@@ -71,63 +75,22 @@ open class WriteMeta(val ingame: TerrarumIngame) {
this.invoke().toByteArray(Common.CHARSET).forEach { ba.add(it) } this.invoke().toByteArray(Common.CHARSET).forEach { ba.add(it) }
return ba return ba
} }
}
/** data class WorldMeta(
* @param b a ByteArray val genver: Int,
* @return Bytes in [b] which are GZip'd then Ascii85-encoded val savename: String
*/ )
fun bytesToZipdStr(b: ByteArray): String {
val sb = StringBuilder()
val bo = ByteArray64GrowableOutputStream()
val zo = GZIPOutputStream(bo)
b.forEach { /**
zo.write(it.toInt()) * @param [s] a String
* @return UTF-8 encoded [s] which are GZip'd then Ascii85-encoded
*/
fun zipStrAndEnascii(s: String): String {
return Common.bytesToZipdStr(s.toByteArray(Common.CHARSET).iterator())
} }
zo.flush(); zo.close()
val ba = bo.toByteArray64() fun unasciiAndUnzipStr(s: String): String {
var bai = 0 return ByteArray64Reader(Common.strToBytes(StringReader(s)), Common.CHARSET).readText()
val buf = IntArray(4) { Ascii85.PAD_BYTE } }
ba.forEach {
if (bai > 0 && bai % 4 == 0) {
sb.append(Ascii85.encode(buf[0], buf[1], buf[2], buf[3]))
buf.fill(Ascii85.PAD_BYTE)
}
buf[bai % 4] = it.toInt() and 255
bai += 1
}; sb.append(Ascii85.encode(buf[0], buf[1], buf[2], buf[3]))
return sb.toString()
} }
/**
 * Compresses [b] with LZMA and renders the compressed bytes as Ascii85 text.
 *
 * The compressed stream is consumed four bytes at a time; a final group is always encoded after
 * the last byte, with unused positions left at `Ascii85.PAD_BYTE`.
 *
 * @param b the raw bytes to compress
 * @return the Ascii85 text of the LZMA-compressed bytes
 */
fun bytesToLzmadStr(b: ByteArray): String {
    val source = ByteArrayInputStream(b)
    val sink = ByteArray64GrowableOutputStream()
    Lzma.compress(source, sink)
    sink.flush()
    sink.close()

    val compressed = sink.toByteArray64()
    val encoded = StringBuilder()
    val group = IntArray(4) { Ascii85.PAD_BYTE }
    var count = 0

    compressed.forEach { byte ->
        val slot = count % 4
        // a full group is emitted lazily, just before the first byte of the next group
        if (slot == 0 && count > 0) {
            encoded.append(Ascii85.encode(group[0], group[1], group[2], group[3]))
            group.fill(Ascii85.PAD_BYTE)
        }
        group[slot] = byte.toInt() and 255
        count += 1
    }
    // flush whatever is left in the buffer (full, partial or untouched)
    encoded.append(Ascii85.encode(group[0], group[1], group[2], group[3]))

    return encoded.toString()
}