Give up on app-i18n/ibus-kkc
Signed-off-by: Shin'ya Minazuki <shinyoukai@laidback.moe>
This commit is contained in:
@@ -1 +0,0 @@
|
|||||||
DIST libkkc-data-0.2.7.tar.xz 22262552 BLAKE2B 2c735ee9fabf8f8f201591c9ed584cece22ddcd15da5f36b39bb422b1bce1dbcbcd66f71b5713e2dd4c5e2862b06b014c24a4a3db63c86ecee20519434da9261 SHA512 61c0cd8c0fa41ed8df49cac6709eebb245cc965d7e192b1ba945e95f2fc46aca8aa48c16e1977a12c157c55dab6b9f4c30f4905806725eca6e697b762eb7cbd7
|
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
#!/usr/bin/python3.13
|
|
||||||
|
|
||||||
# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
|
|
||||||
# Copyright (C) 2011-2013 Red Hat, Inc.
|
|
||||||
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation, either version 3 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
import os
|
|
||||||
import mmap
|
|
||||||
import math
|
|
||||||
import struct
|
|
||||||
|
|
||||||
# Desired false-positive rate of the generated filter; generate() feeds this
# into the standard Bloom-filter sizing formula to pick the bitmap size.
ERROR_RATE = 0.25
|
|
||||||
|
|
||||||
def murmur_hash3_32(b0, b1, seed):
    """MurmurHash3 (x86, 32-bit variant) of an 8-byte key.

    The key is supplied as two little-endian 32-bit words *b0* and *b1*;
    *seed* selects the hash function.  Returns an unsigned 32-bit integer.
    """
    mask = 0xFFFFFFFF

    def _rotl32(x, r):
        # Rotate a 32-bit value left by r bits.
        return ((x << r) | (x >> (32 - r))) & mask

    def _mix(k, h):
        # One body round: scramble the key word, then fold it into the state.
        k = (k * 0xcc9e2d51) & mask
        k = _rotl32(k, 15)
        k = (k * 0x1b873593) & mask
        h = (h ^ k) & mask
        h = _rotl32(h, 13)
        return (h * 5 + 0xe6546b64) & mask

    h1 = _mix(b0, seed & mask)
    h1 = _mix(b1, h1)

    # Finalisation: xor in the key length (8 bytes), then fmix32 avalanche.
    h1 ^= 8
    h1 = (h1 ^ (h1 >> 16)) & mask
    h1 = (h1 * 0x85ebca6b) & mask
    h1 ^= h1 >> 13
    h1 = (h1 * 0xc2b2ae35) & mask
    h1 ^= h1 >> 16
    return h1 & mask
||||||
|
|
||||||
class FilterGenerator(object):
    """Builds a Bloom filter over the first 8 bytes of fixed-size records.

    Reads record_size-byte records from *infile*, hashes the leading two
    32-bit words of every record with four MurmurHash3 seeds, and writes the
    resulting bit array to *outfile*.
    """

    def __init__(self, infile, outfile, record_size):
        self.infile = infile          # input file object (fd is mmap'ed)
        self.outfile = outfile        # output file object (filter bitmap)
        self.record_size = record_size  # bytes per input record

    def generate(self):
        """Size the filter from ERROR_RATE, populate it, and write it out."""
        size = os.fstat(self.infile.fileno()).st_size
        # Integer division (Python 3): size must be a whole number of records.
        n = size // self.record_size
        # Standard Bloom-filter sizing: m = -n*log(p) / log(2)^2 bits,
        # rounded up to a whole number of bytes.
        m = int(math.ceil(-n * math.log10(ERROR_RATE) /
                          math.pow(math.log10(2), 2)))
        m = (m // 8 + 1) * 8
        inmem = mmap.mmap(self.infile.fileno(),
                          size,
                          access=mmap.ACCESS_READ)
        outmem = bytearray(m // 8)
        for i in range(n):
            offset = i * self.record_size
            b0, b1 = struct.unpack("=LL", inmem[offset:offset + 8])
            for k in range(4):
                h = murmur_hash3_32(b0, b1, k)
                # Map the 32-bit hash onto the m filter bits.
                # NOTE(review): h == 0xFFFFFFFF would map to exactly m and
                # index one byte past the bitmap; kept as-is to match the
                # reader's mapping -- confirm against libkkc's lookup code.
                h = int(h * (m / float(0xFFFFFFFF)))
                outmem[h // 8] |= 1 << (h % 8)
        inmem.close()
        # Emit raw bytes (Python 3: str(bytearray) would write the repr).
        # If outfile is a text stream (argparse FileType('w')), write through
        # its underlying binary buffer instead.
        getattr(self.outfile, "buffer", self.outfile).write(bytes(outmem))
|
|
||||||
if __name__ == '__main__':
    import argparse

    # Command line: INFILE OUTFILE RECORD_SIZE.
    parser = argparse.ArgumentParser(description='filter')
    # The input holds raw fixed-size binary records (only its fd is used,
    # for fstat/mmap), so open it in binary mode rather than text mode.
    parser.add_argument('infile', type=argparse.FileType('rb'),
                        help='input file')
    # NOTE(review): the filter bitmap is binary data; FileType('w') is kept
    # for compatibility -- generate() writes through the text wrapper's
    # underlying buffer when given a text-mode file.
    parser.add_argument('outfile', type=argparse.FileType('w'),
                        help='output file')
    parser.add_argument('record_size', type=int,
                        help='record size')
    args = parser.parse_args()
    generator = FilterGenerator(args.infile,
                                args.outfile,
                                args.record_size)
    generator.generate()
|
|
||||||
@@ -1,188 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
|
|
||||||
# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
|
|
||||||
# Copyright (C) 2011-2013 Red Hat, Inc.
|
|
||||||
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License as published by
|
|
||||||
# the Free Software Foundation, either version 3 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU General Public License for more details.
|
|
||||||
|
|
||||||
# You should have received a copy of the GNU General Public License
|
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
import struct
|
|
||||||
import marisa
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Highest n-gram order read from the ARPA model (unigrams through trigrams).
NGRAM = 3
# Matches one ARPA data line: "<cost> <tokens> [<backoff>]", tab/space
# separated; groups are (cost, tokens, optional backoff).
NGRAM_LINE_REGEX = '^([-0-9.]+)[ \t]+([^\t]+?)(?:[ \t]+([-0-9.]+))?$'
|
|
||||||
class SortedGenerator(object):
    """Converts an ARPA language model into libkkc's sorted binary format.

    Reads the model once to build marisa tries over the vocabulary and the
    input (reading) strings, reads it again to collect per-order n-gram
    costs, and finally writes the tries plus the 1/2/3-gram record files
    under the given output prefix.
    """

    def __init__(self, infile, output_prefix):
        self.__infile = infile
        self.__output_prefix = output_prefix
        self.__ngram_line_regex = re.compile(NGRAM_LINE_REGEX)

        # One {ids-tuple: (cost, backoff)} map per n-gram order.
        self.__ngram_entries = [{} for _ in range(NGRAM)]

        self.__vocab_keyset = marisa.Keyset()
        self.__input_keyset = marisa.Keyset()

        self.__vocab_trie = marisa.Trie()
        self.__input_trie = marisa.Trie()

        self.__min_cost = 0.0

    def read(self):
        """Read the ARPA file: build the tries, then collect n-gram costs."""
        print("reading N-grams")
        self.__read_tries()
        self.__read_ngrams()
        print("min cost = %lf" % self.__min_cost)

    def __read_tries(self):
        """First pass: build vocabulary and input tries from the 1-gram section."""
        # Skip the header up to the start of the 1-gram section.
        while True:
            line = self.__infile.readline()
            if line == "":
                break
            if line.startswith("\\1-grams"):
                break

        while True:
            line = self.__infile.readline()
            if line == "":
                break
            line = line.strip()
            if line == "":
                # Blank line terminates the section.
                break
            match = self.__ngram_line_regex.match(line)
            if not match:
                continue
            strv = match.groups()
            self.__vocab_keyset.push_back(strv[1])
            if strv[1] not in ("<s>", "</s>", "<UNK>"):
                if "/" not in strv[1]:
                    continue
                # Vocabulary entries look like "reading/surface"; only the
                # reading part goes into the input trie.
                (reading, surface) = strv[1].split("/")
                self.__input_keyset.push_back(reading)

        self.__vocab_trie.build(self.__vocab_keyset)
        self.__input_trie.build(self.__input_keyset)

    def __read_ngrams(self):
        """Second pass: record (cost, backoff) for every n-gram by word ids."""
        self.__infile.seek(0)
        for n in range(1, NGRAM + 1):
            # Seek to the "\<n>-grams:" section header.
            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                if line.startswith("\\%s-grams:" % n):
                    break

            while True:
                line = self.__infile.readline()
                if line == "":
                    break
                line = line.strip()
                if line == "":
                    break
                match = self.__ngram_line_regex.match(line)
                if not match:
                    continue
                strv = match.groups()
                ngram = strv[1].split(" ")
                ids = []
                for word in ngram:
                    agent = marisa.Agent()
                    agent.set_query(word)
                    if not self.__vocab_trie.lookup(agent):
                        # NOTE(review): out-of-vocabulary words are silently
                        # skipped, which shortens the id tuple -- confirm
                        # this is intended.
                        continue
                    ids.append(agent.key_id())
                cost = float(strv[0])
                # -99 is ARPA's "impossible" marker; exclude it from the
                # minimum-cost statistic.
                if cost != -99 and cost < self.__min_cost:
                    self.__min_cost = cost
                backoff = 0.0
                if strv[2]:
                    backoff = float(strv[2])
                self.__ngram_entries[n - 1][tuple(ids)] = (cost, backoff)

    def write(self):
        """Write the tries and the per-order n-gram record files."""
        # Clamp to a fixed minimum cost for quantization; the value measured
        # during read() is only reported.  NOTE(review): presumably matches
        # what libkkc expects at lookup time -- confirm.
        self.__min_cost = -8.0
        self.__write_tries()
        self.__write_ngrams()

    def __write_tries(self):
        """Save the vocabulary and input tries next to the data files."""
        self.__vocab_trie.save(self.__output_prefix + ".1gram.index")
        self.__input_trie.save(self.__output_prefix + ".input")

    def __write_ngrams(self):
        """Write the sorted 1-gram, 2-gram and (if present) 3-gram files."""
        def quantize(cost, min_cost):
            # Map a (negative) log10 cost onto 0..65535 relative to min_cost.
            return max(0, min(65535, int(cost * 65535 / min_cost)))

        print("writing 1-gram file")
        unigram_offsets = {}
        offset = 0
        with open("%s.1gram" % self.__output_prefix, "wb") as unigram_file:
            for ids, value in sorted(self.__ngram_entries[0].items()):
                unigram_offsets[ids[0]] = offset
                s = struct.pack("=HHH",
                                quantize(value[0], self.__min_cost),
                                quantize(value[1], self.__min_cost),
                                0  # reserved
                                )
                unigram_file.write(s)
                offset += 1

        print("writing 2-gram file")
        bigram_offsets = {}
        keys = list(self.__ngram_entries[1].keys())
        items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids)
                 for ids in keys]
        offset = 0
        with open("%s.2gram" % self.__output_prefix, "wb") as bigram_file:
            # Python 3: sorted() no longer accepts cmp.  Sorting on the
            # packed header matches the old cmp_header comparison; the sort
            # is stable, so equal headers keep their original order.
            for header, ids in sorted(items, key=lambda item: item[0]):
                value = self.__ngram_entries[1][ids]
                bigram_offsets[ids] = offset
                s = struct.pack("=HH",
                                quantize(value[0], self.__min_cost),
                                quantize(value[1], self.__min_cost))
                bigram_file.write(header + s)
                offset += 1

        if len(self.__ngram_entries[2]) > 0:
            print("writing 3-gram file")
            keys = list(self.__ngram_entries[2].keys())
            items = [(struct.pack("=LL", ids[2],
                                  bigram_offsets[(ids[0], ids[1])]), ids)
                     for ids in keys]
            with open("%s.3gram" % self.__output_prefix, "wb") as trigram_file:
                for header, ids in sorted(items, key=lambda item: item[0]):
                    value = self.__ngram_entries[2][ids]
                    s = struct.pack("=H",
                                    quantize(value[0], self.__min_cost))
                    trigram_file.write(header + s)
|
|
||||||
if __name__ == '__main__':
    import argparse
    import sys

    # Command line: [INFILE] OUTPUT_PREFIX; reads stdin when INFILE is omitted.
    arg_parser = argparse.ArgumentParser(description='sortlm')
    arg_parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                            default=sys.stdin,
                            help='language model file')
    arg_parser.add_argument('output_prefix', metavar='OUTPUT_PREFIX', type=str,
                            help='output file prefix')
    options = arg_parser.parse_args()

    converter = SortedGenerator(options.infile, options.output_prefix)
    converter.read()
    converter.write()
|
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
# Copyright 2026 Shin'ya Minazuki

EAPI=8

PYTHON_COMPAT=( python3_{11..12} )

inherit autotools python-single-r1

# libkkc (engine) release under which the matching data tarball is published.
LIBKKC_PV="0.3.5"

DESCRIPTION="Language model data for app-i18n/libkkc"
HOMEPAGE="https://github.com/ueno/libkkc"
# The tarball lives under the libkkc release, so the version in SRC_URI comes
# from LIBKKC_PV rather than ${PV}.
SRC_URI="https://github.com/ueno/libkkc/releases/download/v${LIBKKC_PV}/${P}.tar.xz"

LICENSE="GPL-3+"
SLOT="0"
KEYWORDS="~amd64"

REQUIRED_USE="${PYTHON_REQUIRED_USE}"

# Build-time tools (genfilter.py / sortlm.py) need the marisa Python bindings.
# NOTE(review): with python-single-r1 this should probably be wrapped in
# $(python_gen_cond_dep 'dev-libs/marisa[python,${PYTHON_USEDEP}]') so the
# bindings match the selected interpreter -- confirm before keywording.
DEPEND="
	dev-libs/marisa[python]
"
RDEPEND="${DEPEND}"
|
|
||||||
|
|
||||||
src_prepare() {
	default
	# Install the helper scripts shipped with the ebuild over the upstream
	# ones, and point their shebangs at the selected interpreter.
	# || die: an unchecked cp would silently build with the stale scripts.
	cp -f "${FILESDIR}/genfilter.py" "${S}/tools/genfilter.py" || die
	cp -f "${FILESDIR}/sortlm.py" "${S}/tools/sortlm.py" || die
	python_fix_shebang tools/genfilter.py
	python_fix_shebang tools/sortlm.py
	eautoreconf
}
|
|
||||||
|
|
||||||
src_configure() {
	# Plain econf; no configure options are toggled for this package.
	econf
}
|
|
||||||
|
|
||||||
src_install() {
	# Standard make install into the image directory.
	emake DESTDIR="${D}" install
}
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE pkgmetadata SYSTEM "https://www.gentoo.org/dtd/metadata.dtd">
|
|
||||||
<pkgmetadata>
|
|
||||||
<maintainer type="person">
|
|
||||||
<email>shinyoukai@laidback.moe</email>
|
|
||||||
<name>Shin'ya Minazuki</name>
|
|
||||||
</maintainer>
|
|
||||||
<upstream>
|
|
||||||
<remote-id type="github">ueno/libkkc</remote-id>
|
|
||||||
</upstream>
|
|
||||||
</pkgmetadata>
|
|
||||||
Reference in New Issue
Block a user