Give up on app-i18n/ibus-kkc
Signed-off-by: Shin'ya Minazuki <shinyoukai@laidback.moe>
This commit is contained in:
@@ -1 +0,0 @@
|
|||||||
DIST libkkc-data-0.2.7.tar.xz 22262552 BLAKE2B 2c735ee9fabf8f8f201591c9ed584cece22ddcd15da5f36b39bb422b1bce1dbcbcd66f71b5713e2dd4c5e2862b06b014c24a4a3db63c86ecee20519434da9261 SHA512 61c0cd8c0fa41ed8df49cac6709eebb245cc965d7e192b1ba945e95f2fc46aca8aa48c16e1977a12c157c55dab6b9f4c30f4905806725eca6e697b762eb7cbd7
|
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
#!/usr/bin/python3.13
|
|
||||||
|
|
||||||
# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
|
|
||||||
# Copyright (C) 2011-2013 Red Hat, Inc.
|
|
||||||
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation, either version 3 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
import os
|
|
||||||
import mmap
|
|
||||||
import math
|
|
||||||
import struct
|
|
||||||
|
|
||||||
# Desired false-positive rate of the generated filter; generate() feeds this
# into the standard Bloom-filter sizing formula to pick the bitmap size.
ERROR_RATE = 0.25
|
|
||||||
|
|
||||||
def murmur_hash3_32(b0, b1, seed):
    """MurmurHash3 (x86, 32-bit variant) of an 8-byte key.

    The key is supplied as two little-endian 32-bit words *b0* and *b1*;
    *seed* selects the hash function.  Returns an unsigned 32-bit integer.
    """
    mask = 0xFFFFFFFF

    def _rotl32(x, r):
        # Rotate a 32-bit value left by r bits.
        return ((x << r) | (x >> (32 - r))) & mask

    def _mix(k, h):
        # One body round: scramble the key word, then fold it into the state.
        k = (k * 0xcc9e2d51) & mask
        k = _rotl32(k, 15)
        k = (k * 0x1b873593) & mask
        h = (h ^ k) & mask
        h = _rotl32(h, 13)
        return (h * 5 + 0xe6546b64) & mask

    h1 = _mix(b0, seed & mask)
    h1 = _mix(b1, h1)

    # Finalisation: xor in the key length (8 bytes), then fmix32 avalanche.
    h1 ^= 8
    h1 = (h1 ^ (h1 >> 16)) & mask
    h1 = (h1 * 0x85ebca6b) & mask
    h1 ^= h1 >> 13
    h1 = (h1 * 0xc2b2ae35) & mask
    h1 ^= h1 >> 16
    return h1 & mask
||||||
|
|
||||||
class FilterGenerator(object):
    """Builds a Bloom filter over the first 8 bytes of fixed-size records.

    Reads record_size-byte records from *infile*, hashes the leading two
    32-bit words of every record with four MurmurHash3 seeds, and writes the
    resulting bit array to *outfile*.
    """

    def __init__(self, infile, outfile, record_size):
        self.infile = infile          # input file object (fd is mmap'ed)
        self.outfile = outfile        # output file object (filter bitmap)
        self.record_size = record_size  # bytes per input record

    def generate(self):
        """Size the filter from ERROR_RATE, populate it, and write it out."""
        size = os.fstat(self.infile.fileno()).st_size
        # Integer division (Python 3): size must be a whole number of records.
        n = size // self.record_size
        # Standard Bloom-filter sizing: m = -n*log(p) / log(2)^2 bits,
        # rounded up to a whole number of bytes.
        m = int(math.ceil(-n * math.log10(ERROR_RATE) /
                          math.pow(math.log10(2), 2)))
        m = (m // 8 + 1) * 8
        inmem = mmap.mmap(self.infile.fileno(),
                          size,
                          access=mmap.ACCESS_READ)
        outmem = bytearray(m // 8)
        for i in range(n):
            offset = i * self.record_size
            b0, b1 = struct.unpack("=LL", inmem[offset:offset + 8])
            for k in range(4):
                h = murmur_hash3_32(b0, b1, k)
                # Map the 32-bit hash onto the m filter bits.
                # NOTE(review): h == 0xFFFFFFFF would map to exactly m and
                # index one byte past the bitmap; kept as-is to match the
                # reader's mapping -- confirm against libkkc's lookup code.
                h = int(h * (m / float(0xFFFFFFFF)))
                outmem[h // 8] |= 1 << (h % 8)
        inmem.close()
        # Emit raw bytes (Python 3: str(bytearray) would write the repr).
        # If outfile is a text stream (argparse FileType('w')), write through
        # its underlying binary buffer instead.
        getattr(self.outfile, "buffer", self.outfile).write(bytes(outmem))
|
|
||||||
if __name__ == '__main__':
    import argparse

    # Command line: INFILE OUTFILE RECORD_SIZE.
    parser = argparse.ArgumentParser(description='filter')
    # The input holds raw fixed-size binary records (only its fd is used,
    # for fstat/mmap), so open it in binary mode rather than text mode.
    parser.add_argument('infile', type=argparse.FileType('rb'),
                        help='input file')
    # NOTE(review): the filter bitmap is binary data; FileType('w') is kept
    # for compatibility -- generate() writes through the text wrapper's
    # underlying buffer when given a text-mode file.
    parser.add_argument('outfile', type=argparse.FileType('w'),
                        help='output file')
    parser.add_argument('record_size', type=int,
                        help='record size')
    args = parser.parse_args()
    generator = FilterGenerator(args.infile,
                                args.outfile,
                                args.record_size)
    generator.generate()
|
|
||||||
@@ -1,188 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
|
|
||||||
# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
|
|
||||||
# Copyright (C) 2011-2013 Red Hat, Inc.
|
|
||||||
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation, either version 3 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
import struct
|
|
||||||
import marisa
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Highest n-gram order read from the ARPA model (unigrams through trigrams).
NGRAM = 3
# Matches one ARPA data line: "<cost> <tokens> [<backoff>]", tab/space
# separated; groups are (cost, tokens, optional backoff).
NGRAM_LINE_REGEX = '^([-0-9.]+)[ \t]+([^\t]+?)(?:[ \t]+([-0-9.]+))?$'
|
|
||||||
class SortedGenerator(object):
    """Converts an ARPA language model into libkkc's sorted binary format.

    Reads the model once to build marisa tries over the vocabulary and the
    input (reading) strings, reads it again to collect per-order n-gram
    costs, and finally writes the tries plus the 1/2/3-gram record files
    under the given output prefix.
    """

    def __init__(self, infile, output_prefix):
        self.__infile = infile
        self.__output_prefix = output_prefix
        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)

        # One {ids-tuple: (cost, backoff)} map per n-gram order.
        self.__ngram_entries = [{} for _ in range(NGRAM)]

        self.__vocab_keyset = marisa.Keyset()
        self.__input_keyset = marisa.Keyset()

        self.__vocab_trie = marisa.Trie()
        self.__input_trie = marisa.Trie()

        self.__min_cost = 0.0

    def read(self):
        """Read the ARPA file: build the tries, then collect n-gram costs."""
        print("reading N-grams")
        self.__read_tries()
        self.__read_ngrams()
        print("min cost = %lf" % self.__min_cost)

    def __read_tries(self):
        """First pass: build vocabulary and input tries from the 1-gram section."""
        # Skip the header up to the start of the 1-gram section.
        while True:
            line = self.__infile.readline()
            if line == "":
                break
            if line.startswith("\\1-grams"):
                break

        while True:
            line = self.__infile.readline()
            if line == "":
                break
            line = line.strip()
            if line == "":
                # Blank line terminates the section.
                break
            match = self.__ngram_line_regex.match(line)
            if not match:
                continue
            strv = match.groups()
            self.__vocab_keyset.push_back(strv[1])
            if strv[1] not in ("<s>", "</s>", "<UNK>"):
                if "/" not in strv[1]:
                    continue
                # Vocabulary entries look like "reading/surface"; only the
                # reading part goes into the input trie.
                (reading, surface) = strv[1].split("/")
                self.__input_keyset.push_back(reading)

        self.__vocab_trie.build(self.__vocab_keyset)
        self.__input_trie.build(self.__input_keyset)

    def __read_ngrams(self):
        """Second pass: record (cost, backoff) for every n-gram by word ids."""
        self.__infile.seek(0)
        for n in range(1, NGRAM + 1):
            # Seek to the "\<n>-grams:" section header.
            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                if line.startswith("\\%s-grams:" % n):
                    break

            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                line = line.strip()
                if line == "":
                    break
                match = self.__ngram_line_regex.match(line)
                if not match:
                    continue
                strv = match.groups()
                ngram = strv[1].split(" ")
                ids = []
                for word in ngram:
                    agent = marisa.Agent()
                    agent.set_query(word)
                    if not self.__vocab_trie.lookup(agent):
                        # NOTE(review): out-of-vocabulary words are silently
                        # skipped, which shortens the id tuple -- confirm
                        # this is intended.
                        continue
                    ids.append(agent.key_id())
                cost = float(strv[0])
                # -99 is ARPA's "impossible" marker; exclude it from the
                # minimum-cost statistic.
                if cost != -99 and cost < self.__min_cost:
                    self.__min_cost = cost
                backoff = 0.0
                if strv[2]:
                    backoff = float(strv[2])
                self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)

    def write(self):
        """Write the tries and the per-order n-gram record files."""
        # Clamp to a fixed minimum cost for quantization; the value measured
        # during read() is only reported.  NOTE(review): presumably matches
        # what libkkc expects at lookup time -- confirm.
        self.__min_cost = -8.0
        self.__write_tries()
        self.__write_ngrams()

    def __write_tries(self):
        """Save the vocabulary and input tries next to the data files."""
        self.__vocab_trie.save(self.__output_prefix + ".1gram.index")
        self.__input_trie.save(self.__output_prefix + ".input")

    def __write_ngrams(self):
        """Write the sorted 1-gram, 2-gram and (if present) 3-gram files."""
        def quantize(cost, min_cost):
            # Map a (negative) log10 cost onto 0..65535 relative to min_cost.
            return max(0, min(65535, int(cost * 65535 / min_cost)))

        print("writing 1-gram file")
        unigram_offsets = {}
        offset = 0
        with open("%s.1gram" % self.__output_prefix, "wb") as unigram_file:
            for ids, value in sorted(self.__ngram_entries[0].items()):
                unigram_offsets[ids[0]] = offset
                s = struct.pack("=HHH",
                                quantize(value[0], self.__min_cost),
                                quantize(value[1], self.__min_cost),
                                0  # reserved
                                )
                unigram_file.write(s)
                offset += 1

        print("writing 2-gram file")
        bigram_offsets = {}
        keys = list(self.__ngram_entries[1].keys())
        items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids)
                 for ids in keys]
        offset = 0
        with open("%s.2gram" % self.__output_prefix, "wb") as bigram_file:
            # Python 3: sorted() no longer accepts cmp.  Sorting on the
            # packed header matches the old cmp_header comparison; the sort
            # is stable, so equal headers keep their original order.
            for header, ids in sorted(items, key=lambda item: item[0]):
                value = self.__ngram_entries[1][ids]
                bigram_offsets[ids] = offset
                s = struct.pack("=HH",
                                quantize(value[0], self.__min_cost),
                                quantize(value[1], self.__min_cost))
                bigram_file.write(header + s)
                offset += 1

        if len(self.__ngram_entries[2]) > 0:
            print("writing 3-gram file")
            keys = list(self.__ngram_entries[2].keys())
            items = [(struct.pack("=LL", ids[2],
                                  bigram_offsets[(ids[0], ids[1])]), ids)
                     for ids in keys]
            with open("%s.3gram" % self.__output_prefix, "wb") as trigram_file:
                for header, ids in sorted(items, key=lambda item: item[0]):
                    value = self.__ngram_entries[2][ids]
                    s = struct.pack("=H",
                                    quantize(value[0], self.__min_cost))
                    trigram_file.write(header + s)
|
|
||||||
if __name__ == '__main__':
    import argparse
    import sys

    # Command line: [INFILE] OUTPUT_PREFIX; reads stdin when INFILE is omitted.
    arg_parser = argparse.ArgumentParser(description='sortlm')
    arg_parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                            default=sys.stdin,
                            help='language model file')
    arg_parser.add_argument('output_prefix', metavar='OUTPUT_PREFIX', type=str,
                            help='output file prefix')
    options = arg_parser.parse_args()

    converter = SortedGenerator(options.infile, options.output_prefix)
    converter.read()
    converter.write()
|
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
# Copyright 2026 Shin'ya Minazuki

EAPI=8

PYTHON_COMPAT=( python3_{11..12} )

inherit autotools python-single-r1

# libkkc (engine) release under which the matching data tarball is published.
LIBKKC_PV="0.3.5"

DESCRIPTION="Language model data for app-i18n/libkkc"
HOMEPAGE="https://github.com/ueno/libkkc"
# The tarball lives under the libkkc release, so the version in SRC_URI comes
# from LIBKKC_PV rather than ${PV}.
SRC_URI="https://github.com/ueno/libkkc/releases/download/v${LIBKKC_PV}/${P}.tar.xz"

LICENSE="GPL-3+"
SLOT="0"
KEYWORDS="~amd64"

REQUIRED_USE="${PYTHON_REQUIRED_USE}"

# Build-time tools (genfilter.py / sortlm.py) need the marisa Python bindings.
# NOTE(review): with python-single-r1 this should probably be wrapped in
# $(python_gen_cond_dep 'dev-libs/marisa[python,${PYTHON_USEDEP}]') so the
# bindings match the selected interpreter -- confirm before keywording.
DEPEND="
	dev-libs/marisa[python]
"
RDEPEND="${DEPEND}"
|
|
||||||
|
|
||||||
src_prepare() {
	default
	# Install the helper scripts shipped with the ebuild over the upstream
	# ones, and point their shebangs at the selected interpreter.
	# || die: an unchecked cp would silently build with the stale scripts.
	cp -f "${FILESDIR}/genfilter.py" "${S}/tools/genfilter.py" || die
	cp -f "${FILESDIR}/sortlm.py" "${S}/tools/sortlm.py" || die
	python_fix_shebang tools/genfilter.py
	python_fix_shebang tools/sortlm.py
	eautoreconf
}
|
|
||||||
|
|
||||||
src_configure() {
	# Plain econf; no configure options are toggled for this package.
	econf
}
|
|
||||||
|
|
||||||
src_install() {
	# Standard make install into the image directory.
	emake DESTDIR="${D}" install
}
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE pkgmetadata SYSTEM "https://www.gentoo.org/dtd/metadata.dtd">
|
|
||||||
<pkgmetadata>
|
|
||||||
<maintainer type="person">
|
|
||||||
<email>shinyoukai@laidback.moe</email>
|
|
||||||
<name>Shin'ya Minazuki</name>
|
|
||||||
</maintainer>
|
|
||||||
<upstream>
|
|
||||||
<remote-id type="github">ueno/libkkc</remote-id>
|
|
||||||
</upstream>
|
|
||||||
</pkgmetadata>
|
|
||||||
Reference in New Issue
Block a user