app-i18n/libkkc-data: resuming work on it when it becomes feasible

Signed-off-by: Shin'ya Minazuki <shinyoukai@laidback.moe>
2026-04-10 09:47:38 -03:00
parent 8e8530ecf2
commit 1373951a1c
3 changed files with 320 additions and 3 deletions
--- a/app-i18n/libkkc-data/files/genfilter.py
+++ b/app-i18n/libkkc-data/files/genfilter.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python3
+
+# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2013 Red Hat, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import mmap
+import math
+import struct
+
+ERROR_RATE = 0.25
+
+def murmur_hash3_32(b0, b1, seed):
+    """Return a 32-bit MurmurHash3-style hash of two 32-bit words.
+
+    Each word is pre-mixed and folded into the running state in turn,
+    after which the state is XORed with 8 and run through the final
+    shift/multiply steps.  Every intermediate result is truncated to
+    32 bits.
+
+    Args:
+        b0: first word of the key.
+        b1: second word of the key.
+        seed: initial value of the hash state.
+
+    Returns:
+        The hash value, an int between 0 and 0xFFFFFFFF inclusive.
+    """
+    mask = 0xFFFFFFFF
+    c1 = 0xcc9e2d51
+    c2 = 0x1b873593
+
+    def rotl(value, shift):
+        # Rotate left within 32 bits; ``value`` must already be masked.
+        return ((value << shift) | (value >> (32 - shift))) & mask
+
+    def premix(word):
+        # Multiply, rotate by 15 and multiply again, wrapping at 32 bits.
+        word = (word * c1) & mask
+        word = rotl(word, 15)
+        return (word * c2) & mask
+
+    state = seed
+
+    # body: b0, then b1
+    for word in (b0, b1):
+        state ^= premix(word)
+        state &= mask
+        state = rotl(state, 13)
+        state = (state * 5 + 0xe6546b64) & mask
+
+    # No tail processing needed.
+
+    # fmix
+    # NOTE(review): 8 is presumably the key length in bytes (2 x 4).
+    state ^= 8
+    state &= mask
+    state ^= state >> 16
+    state = (state * 0x85ebca6b) & mask
+    state ^= state >> 13
+    state = (state * 0xc2b2ae35) & mask
+    state ^= state >> 16
+
+    return state
+
+class FilterGenerator(object):
+    """Write a hashed bit-array filter for a file of fixed-size records."""
+
+    def __init__(self, infile, outfile, record_size):
+        self.infile = infile            # open input file (needs fileno())
+        self.outfile = outfile          # open output file
+        self.record_size = record_size  # bytes per input record
+
+    def generate(self):
+        """Hash each record four times, set those bits, write the bitmap."""
+        size = os.fstat(self.infile.fileno()).st_size
+        n = size // self.record_size  # "/" would yield a float on Python 3
+        m = int(math.ceil(-n * math.log10(ERROR_RATE) /
+                          math.pow(math.log10(2), 2)))
+        m = (m // 8 + 1) * 8  # smallest multiple of 8 greater than m
+        outmem = bytearray(m // 8)
+        with mmap.mmap(self.infile.fileno(), size,
+                       access=mmap.ACCESS_READ) as inmem:
+            for i in range(n):
+                offset = i * self.record_size
+                b0, b1 = struct.unpack("=LL", inmem[offset:offset + 8])
+                for k in range(4):
+                    h = int(murmur_hash3_32(b0, b1, k) * (m / float(0xFFFFFFFF)))
+                    outmem[h // 8] |= 1 << (h % 8)
+        # Write raw bytes, unwrapping a text-mode file via its .buffer.
+        getattr(self.outfile, "buffer", self.outfile).write(bytes(outmem))
+
+if __name__ == '__main__':
+    # Command line: genfilter.py INFILE OUTFILE RECORD_SIZE
+    import argparse
+
+    parser = argparse.ArgumentParser(description='filter')
+    # Both files hold packed binary data, so open them in binary mode;
+    # text mode would apply encoding and newline translation to the data.
+    parser.add_argument('infile', type=argparse.FileType('rb'),
+                        help='input file')
+    parser.add_argument('outfile', type=argparse.FileType('wb'),
+                        help='output file')
+    parser.add_argument('record_size', type=int,
+                        help='record size')
+    args = parser.parse_args()
+    generator = FilterGenerator(args.infile, args.outfile, args.record_size)
+    generator.generate()
--- a/app-i18n/libkkc-data/files/sortlm.py
+++ b/app-i18n/libkkc-data/files/sortlm.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+
+# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2013 Red Hat, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import struct
+import marisa
+import re
+
+NGRAM = 3
+NGRAM_LINE_REGEX = '^([-0-9.]+)[ \t]+([^\t]+?)(?:[ \t]+([-0-9.]+))?$'
+
+class SortedGenerator(object):
+    """Convert an N-gram language model text file into binary tables.
+
+    read() parses the model; write() saves the tries and packed tables.
+    """
+
+    def __init__(self, infile, output_prefix):
+        # infile is rewound with seek(0) for a second pass; output_prefix
+        # is prepended to the name of every output file.
+        self.__infile = infile
+        self.__output_prefix = output_prefix
+        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)
+
+        # One dict per order: tuple of vocabulary ids -> (cost, backoff).
+        self.__ngram_entries = [{} for x in range(0, NGRAM)]
+
+        self.__vocab_keyset = marisa.Keyset()
+        self.__input_keyset = marisa.Keyset()
+
+        self.__vocab_trie = marisa.Trie()
+        self.__input_trie = marisa.Trie()
+
+        self.__min_cost = 0.0
+
+    def read(self):
+        """Build the tries, load all N-grams and print the lowest cost seen."""
+        print("reading N-grams")
+        self.__read_tries()
+        self.__read_ngrams()
+        print("min cost = %lf" % self.__min_cost)
+
+    def __read_tries(self):
+        """Collect 1-gram words and their input halves, then build both tries."""
+        # Skip everything up to the 1-gram section header.
+        for line in iter(self.__infile.readline, ""):
+            if line.startswith("\\1-grams"):
+                break
+
+        # The section ends at the first blank line (or at end of file).
+        for line in iter(self.__infile.readline, ""):
+            line = line.strip()
+            if line == "":
+                break
+            match = self.__ngram_line_regex.match(line)
+            if not match:
+                continue
+            word = match.group(2)
+            self.__vocab_keyset.push_back(word)
+            if word not in ("<s>", "</s>", "<UNK>"):
+                if "/" not in word:
+                    continue
+                # Only the part before the "/" goes into the input trie.
+                (reading, _) = word.split("/")
+                self.__input_keyset.push_back(reading)
+
+        self.__vocab_trie.build(self.__vocab_keyset)
+        self.__input_trie.build(self.__input_keyset)
+
+    def __read_ngrams(self):
+        """Rewind the file and fill the (cost, backoff) table of each order."""
+        self.__infile.seek(0)
+        for n in range(1, NGRAM + 1):
+            for line in iter(self.__infile.readline, ""):
+                if line.startswith("\\%s-grams:" % n):
+                    break
+
+            for line in iter(self.__infile.readline, ""):
+                line = line.strip()
+                if line == "":
+                    break
+                match = self.__ngram_line_regex.match(line)
+                if not match:
+                    continue
+                strv = match.groups()
+                ids = []
+                for word in strv[1].split(" "):
+                    agent = marisa.Agent()
+                    agent.set_query(word)
+                    # Words absent from the vocabulary trie are left out.
+                    if not self.__vocab_trie.lookup(agent):
+                        continue
+                    ids.append(agent.key_id())
+                cost = float(strv[0])
+                # A cost of -99 never counts towards the minimum.
+                if cost != -99 and cost < self.__min_cost:
+                    self.__min_cost = cost
+                backoff = 0.0
+                if strv[2]:
+                    backoff = float(strv[2])
+                self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)
+
+    def write(self):
+        """Save the tries and the quantized N-gram tables."""
+        # Quantization uses this fixed floor, not the minimum found by read().
+        self.__min_cost = -8.0
+        self.__write_tries()
+        self.__write_ngrams()
+
+    def __write_tries(self):
+        """Save the vocabulary trie and the input trie."""
+        self.__vocab_trie.save(self.__output_prefix + ".1gram.index")
+        self.__input_trie.save(self.__output_prefix + ".input")
+
+    def __write_ngrams(self):
+        """Write the .1gram, .2gram and (if any) .3gram record files."""
+        def quantize(cost, min_cost):
+            # Scale cost/min_cost to 0..65535, clamping out-of-range values.
+            return max(0, min(65535, int(cost * 65535 / min_cost)))
+
+        def header_key(item):
+            # Sort key replacing the cmp() callback that Python 3 removed.
+            return item[0]
+
+        print("writing 1-gram file")
+        unigram_offsets = {}
+        with open("%s.1gram" % self.__output_prefix, "wb") as unigram_file:
+            entries = sorted(self.__ngram_entries[0].items())
+            for offset, (ids, value) in enumerate(entries):
+                unigram_offsets[ids[0]] = offset
+                s = struct.pack("=HHH",
+                                quantize(value[0], self.__min_cost),
+                                quantize(value[1], self.__min_cost),
+                                0)  # third field is reserved
+                unigram_file.write(s)
+
+        print("writing 2-gram file")
+        bigram_offsets = {}
+        with open("%s.2gram" % self.__output_prefix, "wb") as bigram_file:
+            items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids)
+                     for ids in self.__ngram_entries[1]]
+            for offset, (header, ids) in enumerate(sorted(items, key=header_key)):
+                value = self.__ngram_entries[1][ids]
+                bigram_offsets[ids] = offset
+                s = struct.pack("=HH",
+                                quantize(value[0], self.__min_cost),
+                                quantize(value[1], self.__min_cost))
+                bigram_file.write(header + s)
+
+        if len(self.__ngram_entries[2]) > 0:
+            print("writing 3-gram file")
+            with open("%s.3gram" % self.__output_prefix, "wb") as trigram_file:
+                items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids)
+                         for ids in self.__ngram_entries[2]]
+                for header, ids in sorted(items, key=header_key):
+                    value = self.__ngram_entries[2][ids]
+                    s = struct.pack("=H", quantize(value[0], self.__min_cost))
+                    trigram_file.write(header + s)
+
+if __name__ == '__main__':
+    # Usage: sortlm.py [INFILE] OUTPUT_PREFIX  (INFILE defaults to stdin)
+    import argparse
+    import sys
+
+    cli = argparse.ArgumentParser(description='sortlm')
+    cli.add_argument('infile', nargs='?', type=argparse.FileType('r'),
+                     default=sys.stdin, help='language model file')
+    cli.add_argument('output_prefix', metavar='OUTPUT_PREFIX', type=str,
+                     help='output file prefix')
+    options = cli.parse_args()
+
+    sorter = SortedGenerator(options.infile, options.output_prefix)
+    sorter.read()
+    sorter.write()
--- a/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild.bak
+++ b/app-i18n/libkkc-data/libkkc-data-0.2.7.ebuild.bak
@@ -1,8 +1,8 @@
 # Copyright 2026 Shin'ya Minazuki
 EAPI=8

+PYTHON_COMPAT=( python3_{11..12} )

-PYTHON_COMPAT=( python2_7 )
 inherit autotools python-single-r1

 LIBKKC_PV="0.3.5"
@@ -11,17 +11,21 @@ DESCRIPTION="Language model data for app-i18n/libkkc"
 HOMEPAGE="https://github.com/ueno/libkkc"
 SRC_URI="https://github.com/ueno/libkkc/releases/download/v${LIBKKC_PV}/${P}.tar.xz"

-LICENSE="GPL-2"
+LICENSE="GPL-3+"
 SLOT="0"
 KEYWORDS="~amd64"

 REQUIRED_USE="${PYTHON_REQUIRED_USE}"

-DEPEND="dev-libs/marisa"
+DEPEND="
+	dev-libs/marisa[python]
+"
 RDEPEND="${DEPEND}"

 src_prepare() {
 	default
+	cp -f "${FILESDIR}/genfilter.py" "${S}/tools/genfilter.py"
+	cp -f "${FILESDIR}/sortlm.py" "${S}/tools/sortlm.py"
 	python_fix_shebang tools/genfilter.py
 	python_fix_shebang tools/sortlm.py
 	eautoreconf
@@ -30,3 +34,7 @@ src_prepare() {
 src_configure() {
 	econf
 }
+
+src_install() {
+	emake DESTDIR="${D}" install  # run the "install" make target with DESTDIR set to ${D}
+}