diff --git a/app-i18n/libkkc-data/files/genfilter.py b/app-i18n/libkkc-data/files/genfilter.py
new file mode 100644
index 0000000..a6db99d
--- /dev/null
+++ b/app-i18n/libkkc-data/files/genfilter.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python3
+
+# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2013 Red Hat, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import mmap
+import math
+import struct
+
+ERROR_RATE = 0.25
+
+def murmur_hash3_32(b0, b1, seed):
+    h1 = seed
+
+    c1 = 0xcc9e2d51
+    c2 = 0x1b873593
+
+    # body: b0
+    b0 *= c1
+    b0 &= 0xFFFFFFFF
+    b0 = (b0 << 15) | (b0 >> (32 - 15))
+    b0 &= 0xFFFFFFFF
+    b0 *= c2
+    b0 &= 0xFFFFFFFF
+
+    h1 ^= b0
+    h1 &= 0xFFFFFFFF
+    h1 = (h1 << 13) | (h1 >> (32 - 13))
+    h1 &= 0xFFFFFFFF
+    h1 = h1 * 5 + 0xe6546b64
+    h1 &= 0xFFFFFFFF
+
+    # body: b1
+    b1 *= c1
+    b1 &= 0xFFFFFFFF
+    b1 = (b1 << 15) | (b1 >> (32 - 15))
+    b1 &= 0xFFFFFFFF
+    b1 *= c2
+    b1 &= 0xFFFFFFFF
+
+    h1 ^= b1
+    h1 &= 0xFFFFFFFF
+    h1 = (h1 << 13) | (h1 >> (32 - 13))
+    h1 &= 0xFFFFFFFF
+    h1 = h1 * 5 + 0xe6546b64
+    h1 &= 0xFFFFFFFF
+
+    # No tail processing needed.
+
+    # fmix
+    h1 ^= 8
+    h1 &= 0xFFFFFFFF
+    h1 ^= h1 >> 16
+    h1 &= 0xFFFFFFFF
+    h1 *= 0x85ebca6b
+    h1 &= 0xFFFFFFFF
+    h1 ^= h1 >> 13
+    h1 &= 0xFFFFFFFF
+    h1 *= 0xc2b2ae35
+    h1 &= 0xFFFFFFFF
+    h1 ^= h1 >> 16
+    h1 &= 0xFFFFFFFF
+    return h1
+
+class FilterGenerator(object):
+    def __init__(self, infile, outfile, record_size):
+        self.infile = infile
+        self.outfile = outfile
+        self.record_size = record_size
+
+    def generate(self):
+        size = os.fstat(self.infile.fileno()).st_size
+        n = size // self.record_size
+        m = int(math.ceil(-n * math.log10(ERROR_RATE) /
+                          math.pow(math.log10(2), 2)))
+        m = (m // 8 + 1) * 8
+        inmem = mmap.mmap(self.infile.fileno(),
+                          size,
+                          access=mmap.ACCESS_READ)
+        outmem = bytearray(m // 8)
+        for i in range(0, n):
+            offset = i * self.record_size
+            b0, b1 = struct.unpack("=LL", inmem[offset:offset + 8])
+            for k in range(0, 4):
+                h = murmur_hash3_32(b0, b1, k)
+                h = int(h * (m / float(0xFFFFFFFF)))
+                outmem[h // 8] |= (1 << (h % 8))
+        inmem.close()
+        # outfile is opened in binary mode; write the raw bit array.
+        self.outfile.write(bytes(outmem))
+
+if __name__ == '__main__':
+    import sys
+    import argparse
+
+    parser = argparse.ArgumentParser(description='filter')
+    parser.add_argument('infile', type=argparse.FileType('rb'),
+                        help='input file')
+    parser.add_argument('outfile', type=argparse.FileType('wb'),
+                        help='output file')
+    parser.add_argument('record_size', type=int,
+                        help='record size')
+    args = parser.parse_args()
+    generator = FilterGenerator(args.infile,
+                                args.outfile,
+                                args.record_size)
+    generator.generate()
\ No newline at end of file
diff --git a/app-i18n/libkkc-data/files/sortlm.py b/app-i18n/libkkc-data/files/sortlm.py
new file mode 100644
index 0000000..3390611
--- /dev/null
+++ b/app-i18n/libkkc-data/files/sortlm.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python3
+
+# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2013 Red Hat, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import struct
+import marisa
+import re
+
+NGRAM = 3
+NGRAM_LINE_REGEX = '^([-0-9.]+)[ \t]+([^\t]+?)(?:[ \t]+([-0-9.]+))?$'
+
+class SortedGenerator(object):
+    def __init__(self, infile, output_prefix):
+        self.__infile = infile
+        self.__output_prefix = output_prefix
+        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)
+
+        self.__ngram_entries = [{} for x in range(0, NGRAM)]
+
+        self.__vocab_keyset = marisa.Keyset()
+        self.__input_keyset = marisa.Keyset()
+
+        self.__vocab_trie = marisa.Trie()
+        self.__input_trie = marisa.Trie()
+
+        self.__min_cost = 0.0
+
+    def read(self):
+        print("reading N-grams")
+        self.__read_tries()
+        self.__read_ngrams()
+        print("min cost = %lf" % self.__min_cost)
+
+    def __read_tries(self):
+        while True:
+            line = self.__infile.readline()
+            if line == "":
+                break
+            if line.startswith("\\1-grams"):
+                break
+
+        unigram_count = 0
+        while True:
+            line = self.__infile.readline()
+            if line == "":
+                break
+            line = line.strip()
+            if line == "":
+                break
+            match = self.__ngram_line_regex.match(line)
+            if not match:
+                continue
+            strv = match.groups()
+            self.__vocab_keyset.push_back(strv[1])
+            if not strv[1] in ("<s>", "</s>", "<UNK>"):
+                if "/" not in strv[1]:
+                    continue
+                (input_str, output_str) = strv[1].split("/", 1)
+                self.__input_keyset.push_back(input_str)
+
+        self.__vocab_trie.build(self.__vocab_keyset)
+        self.__input_trie.build(self.__input_keyset)
+
+    def __read_ngrams(self):
+        self.__infile.seek(0)
+        for n in range(1, NGRAM + 1):
+            while True:
+                line = self.__infile.readline()
+                if line == "":
+                    break
+                if line.startswith("\\%s-grams:" % n):
+                    break
+
+            while True:
+                line = self.__infile.readline()
+                if line == "":
+                    break
+                line = line.strip()
+                if line == "":
+                    break
+                match = self.__ngram_line_regex.match(line)
+                if not match:
+                    continue
+                strv = match.groups()
+                ngram = strv[1].split(" ")
+                ids = []
+                for word in ngram:
+                    agent = marisa.Agent()
+                    agent.set_query(word)
+                    if not self.__vocab_trie.lookup(agent):
+                        continue
+                    ids.append(agent.key_id())
+                cost = float(strv[0])
+                if cost != -99 and cost < self.__min_cost:
+                    self.__min_cost = cost
+                backoff = 0.0
+                if strv[2]:
+                    backoff = float(strv[2])
+                self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)
+
+    def write(self):
+        self.__min_cost = -8.0
+        self.__write_tries()
+        self.__write_ngrams()
+
+    def __write_tries(self):
+        self.__vocab_trie.save(self.__output_prefix + ".1gram.index")
+        self.__input_trie.save(self.__output_prefix + ".input")
+
+    def __write_ngrams(self):
+        def quantize(cost, min_cost):
+            return max(0, min(65535, int(cost * 65535 / min_cost)))
+
+        def header_key(item):
+            return item[0]
+
+        print("writing 1-gram file")
+        unigram_offsets = {}
+        unigram_file = open("%s.1gram" % self.__output_prefix, "wb")
+        offset = 0
+        for ids, value in sorted(self.__ngram_entries[0].items()):
+            unigram_offsets[ids[0]] = offset
+            s = struct.pack("=HHH",
+                            quantize(value[0], self.__min_cost),
+                            quantize(value[1], self.__min_cost),
+                            0  # reserved
+                            )
+            unigram_file.write(s)
+            offset += 1
+        unigram_file.close()
+
+        print("writing 2-gram file")
+        bigram_offsets = {}
+        bigram_file = open("%s.2gram" % self.__output_prefix, "wb")
+        keys = list(self.__ngram_entries[1].keys())
+        items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys]
+        offset = 0
+        for header, ids in sorted(items, key=header_key):
+            value = self.__ngram_entries[1][ids]
+            bigram_offsets[ids] = offset
+            s = struct.pack("=HH",
+                            quantize(value[0], self.__min_cost),
+                            quantize(value[1], self.__min_cost))
+            bigram_file.write(header + s)
+            offset += 1
+        bigram_file.close()
+
+        if len(self.__ngram_entries[2]) > 0:
+            print("writing 3-gram file")
+            trigram_file = open("%s.3gram" % self.__output_prefix, "wb")
+            keys = list(self.__ngram_entries[2].keys())
+            items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys]
+            for header, ids in sorted(items, key=header_key):
+                value = self.__ngram_entries[2][ids]
+                s = struct.pack("=H",
+                                quantize(value[0], self.__min_cost))
+                trigram_file.write(header + s)
+            trigram_file.close()
+
+if __name__ == '__main__':
+    import sys
+    import argparse
+
+    parser = argparse.ArgumentParser(description='sortlm')
+    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
+                        default=sys.stdin,
+                        help='language model file')
+    parser.add_argument('output_prefix', metavar='OUTPUT_PREFIX', type=str,
+                        help='output file prefix')
+    args = parser.parse_args()
+
+    generator = SortedGenerator(args.infile, args.output_prefix)
+    generator.read()
+    generator.write()
\ No newline at end of file
diff --git a/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild b/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild.bak
similarity index 67%
rename from app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild
rename to app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild.bak
index a98d643..b964a14 100644
--- a/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild
+++ b/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild.bak
@@ -1,8 +1,8 @@
 # Copyright 2026 Shin'ya Minazuki
 
 EAPI=8
+PYTHON_COMPAT=( python3_{11..12} )
 
-PYTHON_COMPAT=( python2_7 )
 inherit autotools python-single-r1
 
 LIBKKC_PV="0.3.5"
@@ -11,17 +11,21 @@
 DESCRIPTION="Language model data for app-i18n/libkkc"
 HOMEPAGE="https://github.com/ueno/libkkc"
 SRC_URI="https://github.com/ueno/libkkc/releases/download/v${LIBKKC_PV}/${P}.tar.xz"
 
-LICENSE="GPL-2"
+LICENSE="GPL-3+"
 SLOT="0"
 KEYWORDS="~amd64"
 REQUIRED_USE="${PYTHON_REQUIRED_USE}"
 
-DEPEND="dev-libs/marisa"
+DEPEND="
+	dev-libs/marisa[python]
+"
 RDEPEND="${DEPEND}"
 
 src_prepare() {
 	default
+	cp -f "${FILESDIR}/genfilter.py" "${S}/tools/genfilter.py"
+	cp -f "${FILESDIR}/sortlm.py" "${S}/tools/sortlm.py"
 	python_fix_shebang tools/genfilter.py
 	python_fix_shebang tools/sortlm.py
 	eautoreconf
@@ -30,3 +34,7 @@ src_prepare() {
 src_configure() {
 	econf
 }
+
+src_install() {
+	emake DESTDIR="${D}" install
+}