diff --git a/src/kcharselect-data b/src/kcharselect-data index 45aa3ce..e8ca45f 100644 Binary files a/src/kcharselect-data and b/src/kcharselect-data differ diff --git a/src/kcharselect-generate-datafile.py b/src/kcharselect-generate-datafile.py index e3f0fc4..54e3751 100755 --- a/src/kcharselect-generate-datafile.py +++ b/src/kcharselect-generate-datafile.py @@ -1,880 +1,882 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- # # This script generates a data file containing all Unicode information needed # by KCharSelect. # ############################################################################## # Copyright (C) 2007 Daniel Laidig # Copyright (C) 2016 John Zaitseff # # This script is free software; you can redistribute it and/or modify it under # the terms of the GNU Library General Public License as published by the Free # Software Foundation; either version 2 of the License, or (at your option) # any later version. # # This script is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public # License for more details. # # You should have received a copy of the GNU Library General Public License # along with this library; see the file COPYING.LIB. If not, write to the # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. ############################################################################## # # The current directory must contain the following files that can be found at # http://www.unicode.org/Public/UNIDATA/: # - UnicodeData.txt # - Unihan_Readings.txt (you need to uncompress it from Unihan.zip) # - NamesList.txt # - Blocks.txt # # The generated file is named "kcharselect-data" and has to be put in # kwidgetsaddons/src. Additionally a translation dummy named # "kcharselect-translation.cpp" is generated and has to be placed in the same # directory. 
# # FILE STRUCTURE # # The generated file is a binary file. The first 40 bytes are the header and # contain the position of each part of the file. Each entry is uint32. # # pos content # 0 names strings begin # 4 names offsets begin # 8 details strings begin # 12 details offsets begin # 16 block strings begin # 20 block offsets begin # 24 section strings begin # 28 section offsets begin # 32 unihan strings begin # 36 unihan offsets begin # # The string parts always contain all strings in a row, each followed by a 0x00 # byte. There is one exception: The data for seeAlso in details is only 2 # bytes (as it is always _one_ unicode character) and _not_ followed by a 0x00 # byte. # # The offset parts contain entries with a fixed length. Unicode characters # are always uint16 and offsets uint32. Offsets are positions in the data # file. # # names_offsets: # each entry 6 bytes # 16bit: unicode # 32bit: offset to name in names_strings # # names_strings: # the first byte is the category (same values as QChar::Category), # directly followed by the character name (terminated by 0x00) # # nameslist_offsets: # char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count # 16 32 8 32 8 32 8 32 8 32 8 # => each entry 27 bytes # # blocks_offsets: # each entry 4 bytes # 16bit: start unicode # 16bit: end unicode # Note that there is no string offset. # # section_offsets: # each entry 4 bytes # 16bit: section offset # 16bit: block offset # Note that these offsets are _not_ positions in the data file but indexes. # For example 0x0403 means the fourth section includes the third block. 
# # unihan_offsets: # each entry 30 bytes # 16bit: unicode # 32bit: offset to unihan_strings for Definition # 32bit: offset to unihan_strings for Cantonese # 32bit: offset to unihan_strings for Mandarin # 32bit: offset to unihan_strings for Tang # 32bit: offset to unihan_strings for Korean # 32bit: offset to unihan_strings for JapaneseKun # 32bit: offset to unihan_strings for JapaneseOn from struct import * import sys import re import io # Based on http://www.unicode.org/charts/, updated for Unicode 9.0 sectiondata = ''' SECTION European Scripts Basic Latin Latin-1 Supplement Latin Extended-A Latin Extended-B Latin Extended-C Latin Extended-D Latin Extended-E Latin Extended Additional Armenian Coptic Cyrillic Cyrillic Supplement Cyrillic Extended-A Cyrillic Extended-B Cyrillic Extended-C Georgian Georgian Supplement +Georgian Extended Glagolitic Greek and Coptic Greek Extended Ogham Runic SECTION African Scripts Bamum Ethiopic Ethiopic Supplement Ethiopic Extended Ethiopic Extended-A NKo Tifinagh Vai SECTION Middle Eastern Scripts Arabic Arabic Supplement Arabic Extended-A Arabic Presentation Forms-A Arabic Presentation Forms-B Hebrew Mandaic Samaritan Syriac Syriac Supplement SECTION Central Asian Scripts Mongolian Phags-pa Tibetan SECTION South Asian Scripts Bengali Common Indic Number Forms Devanagari Devanagari Extended Gujarati Gurmukhi Kannada Lepcha Limbu Malayalam Meetei Mayek Meetei Mayek Extensions Ol Chiki Oriya Saurashtra Sinhala Syloti Nagri Tamil Telugu Thaana Vedic Extensions SECTION Southeast Asian Scripts Cham Kayah Li Khmer Khmer Symbols Lao Myanmar Myanmar Extended-A Myanmar Extended-B New Tai Lue Tai Le Tai Tham Tai Viet Thai SECTION Indonesia and Oceania Scripts Balinese Batak Buginese Buhid Hanunoo Javanese Rejang Sundanese Sundanese Supplement Tagalog Tagbanwa SECTION East Asian Scripts Bopomofo Bopomofo Extended CJK Unified Ideographs CJK Unified Ideographs Extension A CJK Compatibility CJK Compatibility Ideographs CJK Compatibility Forms 
CJK Radicals Supplement CJK Strokes CJK Symbols and Punctuation Enclosed CJK Letters and Months Hangul Jamo Hangul Jamo Extended-A Hangul Jamo Extended-B Hangul Compatibility Jamo Hangul Syllables Hiragana Ideographic Description Characters Kanbun Kangxi Radicals Katakana Katakana Phonetic Extensions Lisu Yi Radicals Yi Syllables SECTION American Scripts Cherokee Cherokee Supplement Unified Canadian Aboriginal Syllabics Unified Canadian Aboriginal Syllabics Extended SECTION Symbols General Punctuation Alchemical Symbols Braille Patterns +Chess Symbols Control Pictures Currency Symbols Dingbats Domino Tiles Emoticons Enclosed Alphanumerics Enclosed Alphanumeric Supplement Enclosed Ideographic Supplement Mahjong Tiles Miscellaneous Symbols Miscellaneous Symbols and Pictographs Miscellaneous Technical Optical Character Recognition Ornamental Dingbats Playing Cards Small Form Variants Supplemental Punctuation Supplemental Symbols and Pictographs Transport and Map Symbols Vertical Forms Yijing Hexagram Symbols SECTION Mathematical Symbols Arrows Block Elements Box Drawing Geometric Shapes Geometric Shapes Extended Letterlike Symbols Mathematical Operators Miscellaneous Mathematical Symbols-A Miscellaneous Mathematical Symbols-B Miscellaneous Symbols and Arrows Number Forms Superscripts and Subscripts Supplemental Arrows-A Supplemental Arrows-B Supplemental Arrows-C Supplemental Mathematical Operators SECTION Phonetic Symbols IPA Extensions Modifier Tone Letters Phonetic Extensions Phonetic Extensions Supplement Spacing Modifier Letters SECTION Combining Diacritics Combining Diacritical Marks Combining Diacritical Marks Extended Combining Diacritical Marks Supplement Combining Diacritical Marks for Symbols Combining Half Marks SECTION Other Alphabetic Presentation Forms Halfwidth and Fullwidth Forms High Private Use Surrogates High Surrogates Low Surrogates Private Use Area Specials Variation Selectors ''' categoryMap = { # same values as QChar::Category "Mn": 1, "Mc": 
2, "Me": 3, "Nd": 4, "Nl": 5, "No": 6, "Zs": 7, "Zl": 8, "Zp": 9, "Cc": 10, "Cf": 11, "Cs": 12, "Co": 13, "Cn": 14, "Lu": 15, "Ll": 16, "Lt": 17, "Lm": 18, "Lo": 19, "Pc": 20, "Pd": 21, "Ps": 22, "Pe": 23, "Pi": 24, "Pf": 25, "Po": 26, "Sm": 27, "Sc": 28, "Sk": 29, "So": 30 } # Temporary code point remapping # # Initial SMP support without needing a new data file format # - BMP U+Fxxx are remapped to U+Exxx # - SMP symbols U+1Fxxx are remapped to U+Fxxx # - Private Use Area is limited to U+F000 ... U+F8FF def remap(char): cp = int(char, 16) if cp >= 0xE000 and cp <= 0xFFFF: return "E"+char[1:] if cp >= 0x1F000 and cp <= 0x1FFFF: return char[1:] return char class Names: def __init__(self): self.names = [] self.controlpos = -1 def addName(self, uni, name, category): self.names.append([uni, name, category]) def calculateStringSize(self): size = 0 hadcontrol = False for entry in self.names: if entry[1] == "": if not hadcontrol: size += len(entry[1]) + 2 hadcontrol = True else: size += len(entry[1]) + 2 return size def calculateOffsetSize(self): return len(self.names)*6 def writeStrings(self, out, pos): hadcontrol = False for entry in self.names: if entry[1] == "": if not hadcontrol: out.write(pack("=b", entry[2])) out.write(entry[1].encode("utf-8") + b"\0") size = len(entry[1]) + 2 entry[1] = pos self.controlpos = pos pos += size hadcontrol = True else: entry[1] = self.controlpos else: out.write(pack("=b", entry[2])) out.write(entry[1].encode("utf-8") + b"\0") size = len(entry[1]) + 2 entry[1] = pos pos += size return pos def writeOffsets(self, out, pos): for entry in self.names: out.write(pack("=HI", int(entry[0], 16), entry[1])) pos += 6 return pos class Details: def __init__(self): self.details = {} def addEntry(self, char, category, text): if not char in self.details: self.details[char] = {} if not category in self.details[char]: self.details[char][category] = [] self.details[char][category].append(text) def calculateStringSize(self): size = 0 for char in 
self.details.values(): for cat in char.values(): for s in cat: if type(s) is str: size += len(s.encode("utf-8")) + 1 else: size += 2 return size def calculateOffsetSize(self): return len(self.details)*27 def writeStrings(self, out, pos): for char in self.details.values(): for cat in char.values(): for i in range(0, len(cat)): s = cat[i] if type(s) is str: out.write(s.encode("utf-8") + b"\0") size = len(s.encode("utf-8")) + 1 else: out.write(pack("=H", s)) size = 2 cat[i] = pos pos += size return pos def writeOffsets(self, out, pos): for char in self.details.keys(): alias = 0 alias_count = 0 note = 0 note_count = 0 approxEquiv = 0 approxEquiv_count = 0 equiv = 0 equiv_count = 0 seeAlso = 0 seeAlso_count = 0 if "alias" in self.details[char]: alias = self.details[char]["alias"][0] alias_count = len(self.details[char]["alias"]) if "note" in self.details[char]: note = self.details[char]["note"][0] note_count = len(self.details[char]["note"]) if "approxEquiv" in self.details[char]: approxEquiv = self.details[char]["approxEquiv"][0] approxEquiv_count = len(self.details[char]["approxEquiv"]) if "equiv" in self.details[char]: equiv = self.details[char]["equiv"][0] equiv_count = len(self.details[char]["equiv"]) if "seeAlso" in self.details[char]: seeAlso = self.details[char]["seeAlso"][0] seeAlso_count = len(self.details[char]["seeAlso"]) out.write(pack("=HIbIbIbIbIb", char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count)) pos += 27 return pos class SectionsBlocks: def __init__(self): self.sections = [] self.blocks = [] self.blockList = [] self.sectionList = [] def addBlock(self, begin, end, name): self.blocks.append([begin, end, name]) self.blockList.append(name) def addSection(self, section, block): self.sections.append([section, block]) if not section in self.sectionList: self.sectionList.append(section) def calculateBlockStringSize(self): size = 0 for block in self.blocks: size += len(block[2]) + 1 return 
size def calculateBlockOffsetSize(self): return len(self.blocks) * 4 def calculateSectionStringSize(self): size = 0 lastsection = "" for section in self.sections: if section[0] != lastsection: size += len(section[0]) + 1 lastsection = section[0] return size def calculateSectionOffsetSize(self): return len(self.sections) * 4 def writeBlockStrings(self, out, pos): index = 0 for block in self.blocks: out.write(block[2].encode("utf-8") + b"\0") size = len(block[2].encode("utf-8")) + 1 found = False for section in self.sections: if section[1] == block[2]: print("found", section) section[1] = index found = True if not found: print("Error: Did not find any category for block \""+block[2]+"\"") sys.exit(1) block[2] = index pos += size index += 1 return pos def writeBlockOffsets(self, out, pos): for block in self.blocks: out.write(pack("=HH", int(block[0], 16), int(block[1], 16))) pos += 4 return pos def writeSectionStrings(self, out, pos): lastsection = "" lastpos = 0 index = -1 for section in self.sections: if section[0] != lastsection: index += 1 lastsection = section[0] out.write(section[0].encode("utf-8") + b"\0") size = len(section[0].encode("utf-8")) + 1 section[0] = index lastpos = pos pos += size else: section[0] = index return pos def writeSectionOffsets(self, out, pos): for section in self.sections: out.write(pack("=HH", section[0], section[1])) pos += 4 return pos def getBlockList(self): return self.blockList def getSectionList(self): return self.sectionList class Unihan: def __init__(self): self.unihan = {} def addUnihan(self, uni, category, value): uni = int(uni, 16) if category != "kDefinition" and category != "kCantonese" and category != "kMandarin" and category != "kTang" and category != "kKorean" and category != "kJapaneseKun" and category != "kJapaneseOn": return if not uni in self.unihan: self.unihan[uni] = [None, None, None, None, None, None, None] if category == "kDefinition": self.unihan[uni][0] = value elif category == "kCantonese": 
self.unihan[uni][1] = value elif category == "kMandarin": self.unihan[uni][2] = value elif category == "kTang": self.unihan[uni][3] = value elif category == "kKorean": self.unihan[uni][4] = value elif category == "kJapaneseKun": self.unihan[uni][5] = value elif category == "kJapaneseOn": self.unihan[uni][6] = value def calculateStringSize(self): size = 0 for char in self.unihan.keys(): for entry in self.unihan[char]: if entry != None: size += len(entry.encode("utf-8")) + 1 return size def calculateOffsetSize(self): return len(self.unihan) * 30 def writeStrings(self, out, pos): for char in self.unihan.keys(): for i in range(0, 7): if self.unihan[char][i] != None: out.write(self.unihan[char][i].encode("utf-8") + b"\0") size = len(self.unihan[char][i].encode("utf-8")) + 1 self.unihan[char][i] = pos pos += size return pos def writeOffsets(self, out, pos): for char in self.unihan.keys(): out.write(pack("=H", char)) for i in range(0, 7): if self.unihan[char][i] != None: out.write(pack("=I", self.unihan[char][i])) else: out.write(pack("=I", 0)) pos += 30 return pos class Parser: def parseUnicodeData(self, inUnicodeData, names): regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)') for line in inUnicodeData: line = line[:-1] m = regexp.match(line) if not m: continue uni = remap(m.group(1)) name = m.group(2) category = m.group(3) if len(uni) > 4: continue names.addName(uni, name, categoryMap[category]) def parseDetails(self, inNamesList, details): invalidRegexp = re.compile(r'^@') unicodeRegexp = re.compile(r'^([0-9A-F]+)') aliasRegexp = re.compile(r'^\s+=\s+(.+)$') #equal seeAlsoRegexp1 = re.compile(r'^\s+x\s+.*\s([0-9A-F]{4,6})\)$') #ex seeAlsoRegexp2 = re.compile(r'^\s+x\s+([0-9A-F]{4,6})$') #ex noteRegexp = re.compile(r'^\s+\*\s+(.+)$') #star approxEquivalentRegexp = re.compile(r'^\s+#\s+(.+)$') #pound equivalentRegexp = re.compile(r'^\s+:\s+(.+)$') #colon drop = 0 currChar = 0 for line in inNamesList: line = line[:-1] m1 = unicodeRegexp.match(line) m2 = 
aliasRegexp.match(line) m3 = noteRegexp.match(line) m4 = approxEquivalentRegexp.match(line) m5 = equivalentRegexp.match(line) m6 = seeAlsoRegexp1.match(line) m7 = seeAlsoRegexp2.match(line) if invalidRegexp.match(line): continue elif m1: mg1 = remap(m1.group(1)) currChar = int(mg1, 16) if len(mg1) > 4: drop = 1 continue elif drop == 1: continue elif m2: value = m2.group(1) details.addEntry(currChar, "alias", value) elif m3: value = m3.group(1) details.addEntry(currChar, "note", value) elif m4: value = m4.group(1) details.addEntry(currChar, "approxEquiv", value) elif m5: value = m5.group(1) details.addEntry(currChar, "equiv", value) elif m6: value = int(remap(m6.group(1)), 16) if value < 0x10000: details.addEntry(currChar, "seeAlso", value) elif m7: value = int(remap(m7.group(1)), 16) if value < 0x10000: details.addEntry(currChar, "seeAlso", value) def parseBlocks(self, inBlocks, sectionsBlocks): regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$') for line in inBlocks: line = line[:-1] m = regexp.match(line) if not m: continue m1 = remap(m.group(1)) m2 = remap(m.group(2)) if len(m1) > 4: continue sectionsBlocks.addBlock(m1, m2, m.group(3)) def parseSections(self, inSections, sectionsBlocks): currSection = "" for line in inSections: line = line[:-1] if len(line) == 0: continue temp = line.split(" ") if temp[0] == "SECTION": currSection = line[8:] elif currSection != "": sectionsBlocks.addSection(currSection, line) else: print("error in data file") sys.exit(1) def parseUnihan(self, inUnihan, unihan): regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$') count = 0 for line in inUnihan: if count % 100000 == 0: print("\b."); sys.stdout.flush() count += 1 line = line[:-1] m = regexp.match(line) if not m: continue if len(remap(m.group(1))) <= 4: unihan.addUnihan(remap(m.group(1)), m.group(2), m.group(3)) def writeTranslationDummy(out, data): out.write(b"""/* This file is part of the KDE libraries Copyright (C) 2007 Daniel Laidig Copyright (C) 2016 John Zaitseff 
This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. This file is autogenerated by kcharselect/kcharselect-generate-datafile.py */\n\n""") for group in data: for entry in group[1]: out.write(b"QT_TRANSLATE_NOOP3(\"KCharSelectData\", \""+entry.encode("utf-8")+b"\", \""+group[0].encode("utf-8")+b"\");\n") out = open("kcharselect-data", "wb") outTranslationDummy = open("kcharselect-translation.cpp", "wb") inUnicodeData = open("UnicodeData.txt", "r") inNamesList = open("NamesList.txt", "r") inBlocks = open("Blocks.txt", "r") inSections = io.StringIO(sectiondata) inUnihan = open("Unihan_Readings.txt", "r") if calcsize('=H') != 2 or calcsize('=I') != 4: print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected") sys.exit(1) names = Names() details = Details() sectionsBlocks = SectionsBlocks() unihan = Unihan() parser = Parser() print("========== parsing files ===================") parser.parseUnicodeData(inUnicodeData, names) print("."); sys.stdout.flush() parser.parseDetails(inNamesList, details) print("\b."); sys.stdout.flush() parser.parseBlocks(inBlocks, sectionsBlocks) print("\b."); sys.stdout.flush() parser.parseSections(inSections, sectionsBlocks) print("\b."); sys.stdout.flush() parser.parseUnihan(inUnihan, unihan) print("\b."); sys.stdout.flush() print("done.") pos = 0 #write header, 
size: 40 bytes print("========== writing header ==================") out.write(pack("=I", 40)) print("names strings begin", 40) namesOffsetBegin = names.calculateStringSize() + 40 out.write(pack("=I", namesOffsetBegin)) print("names offsets begin", namesOffsetBegin) detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize() out.write(pack("=I", detailsStringBegin)) print("details strings begin", detailsStringBegin) detailsOffsetBegin = detailsStringBegin + details.calculateStringSize() out.write(pack("=I", detailsOffsetBegin)) print("details offsets begin", detailsOffsetBegin) blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize() out.write(pack("=I", blocksStringBegin)) print("block strings begin", blocksStringBegin) blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize() out.write(pack("=I", blocksOffsetBegin)) print("block offsets begin", blocksOffsetBegin) sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize() out.write(pack("=I", sectionStringBegin)) print("section strings begin", sectionStringBegin) sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize() out.write(pack("=I", sectionOffsetBegin)) print("section offsets begin", sectionOffsetBegin) unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize() out.write(pack("=I", unihanStringBegin)) print("unihan strings begin", unihanStringBegin) unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize() out.write(pack("=I", unihanOffsetBegin)) print("unihan offsets begin", unihanOffsetBegin) end = unihanOffsetBegin + unihan.calculateOffsetSize() print("end should be", end) pos += 40 print("========== writing data ====================") pos = names.writeStrings(out, pos) print("names strings written, position", pos) pos = names.writeOffsets(out, pos) print("names offsets written, position", pos) pos = details.writeStrings(out, pos) print("details strings written, 
position", pos) pos = details.writeOffsets(out, pos) print("details offsets written, position", pos) pos = sectionsBlocks.writeBlockStrings(out, pos) print("block strings written, position", pos) pos = sectionsBlocks.writeBlockOffsets(out, pos) print("block offsets written, position", pos) pos = sectionsBlocks.writeSectionStrings(out, pos) print("section strings written, position", pos) pos = sectionsBlocks.writeSectionOffsets(out, pos) print("section offsets written, position", pos) pos = unihan.writeStrings(out, pos) print("unihan strings written, position", pos) pos = unihan.writeOffsets(out, pos) print("unihan offsets written, position", pos) print("========== writing translation dummy ======") translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()], ["KCharselect unicode block name",sectionsBlocks.getBlockList()]] writeTranslationDummy(outTranslationDummy, translationData) print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.") diff --git a/src/kcharselectdata.cpp b/src/kcharselectdata.cpp index 57f343b..f54ffc4 100644 --- a/src/kcharselectdata.cpp +++ b/src/kcharselectdata.cpp @@ -1,1046 +1,1046 @@ /* This file is part of the KDE libraries Copyright (C) 2007 Daniel Laidig This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "kcharselectdata_p.h" #include #include #include #include #include #include #include #include #include #include /* constants for hangul (de)composition, see UAX #15 */ #define SBase 0xAC00 #define LBase 0x1100 #define VBase 0x1161 #define TBase 0x11A7 #define LCount 19 #define VCount 21 #define TCount 28 #define NCount (VCount * TCount) #define SCount (LCount * NCount) class RunIndexCreation : public QFutureInterface, public QRunnable { public: RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile) : m_data(data), m_dataFile(dataFile) { } QFuture start() { setRunnable(this); reportStarted(); QFuture f = this->future(); QThreadPool::globalInstance()->start(this); return f; } void run() override { Index index = m_data->createIndex(m_dataFile); reportResult(index); reportFinished(); } private: KCharSelectData *m_data; QByteArray m_dataFile; }; static const char JAMO_L_TABLE[][4] = { "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" }; static const char JAMO_V_TABLE[][4] = { "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I" }; static const char JAMO_T_TABLE[][4] = { "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" }; bool KCharSelectData::openDataFile() { if (!dataFile.isEmpty()) { return true; } else { QFile file(QStandardPaths::locate(QStandardPaths::GenericDataLocation, QStringLiteral("kf5/kcharselect/kcharselect-data"))); if (!file.open(QIODevice::ReadOnly)) { return false; } dataFile = file.readAll(); file.close(); if (dataFile.size() < 40) { dataFile.clear(); return false; } const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 20); const quint32 offsetEnd = qFromLittleEndian(data + 24); uint blocks = (offsetEnd - offsetBegin) / 4; if (blocks <= 167) { // 
maximum possible number of blocks in BMP // no remapping remapType = -1; - } else if (blocks >= 174 && blocks <= 175) { + } else if (blocks >= 174 && blocks <= 177) { // remapping introduced in 5.25 remapType = 0; } else { // unknown remapping, abort dataFile.clear(); return false; } futureIndex = (new RunIndexCreation(this, dataFile))->start(); return true; } } // Temporary remapping code points <-> 16 bit database codes // See kcharselect-generate-datafile.py for details quint16 KCharSelectData::mapCodePointToDataBase(uint code) const { if (remapType == 0) { if (code >= 0xE000 && code <= 0xEFFF) { return 0xFFFF; } if (code >= 0xF000 && code <= 0xFFFF) { return code - 0x1000; } if (code >= 0x1F000 && code <= 0x1FFFF) { return code - 0x10000; } } if (code >= 0x10000) { return 0xFFFF; } return code; } uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const { if (remapType == 0) { if (code >= 0xE000 && code <= 0xEFFF) { return code + 0x1000; } if (code >= 0xF000) { return code + 0x10000; } } return code; } quint32 KCharSelectData::getDetailIndex(uint c) const { const uchar *data = reinterpret_cast(dataFile.constData()); // Convert from little-endian, so that this code works on PPC too. 
// http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286 const quint32 offsetBegin = qFromLittleEndian(data + 12); const quint32 offsetEnd = qFromLittleEndian(data + 16); int min = 0; int mid; int max = ((offsetEnd - offsetBegin) / 27) - 1; quint16 unicode = mapCodePointToDataBase(c); if (unicode == 0xFFFF) { return 0; } static quint16 most_recent_searched; static quint32 most_recent_result; if (unicode == most_recent_searched) { return most_recent_result; } most_recent_searched = unicode; while (max >= min) { mid = (min + max) / 2; const quint16 midUnicode = qFromLittleEndian(data + offsetBegin + mid * 27); if (unicode > midUnicode) { min = mid + 1; } else if (unicode < midUnicode) { max = mid - 1; } else { most_recent_result = offsetBegin + mid * 27; return most_recent_result; } } most_recent_result = 0; return 0; } QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base) { QString s = QString::number(code, base).toUpper(); while (s.size() < length) { s.prepend(QLatin1Char('0')); } s.prepend(prefix); return s; } QVector KCharSelectData::blockContents(int block) { if (!openDataFile()) { return QVector(); } const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 20); const quint32 offsetEnd = qFromLittleEndian(data + 24); int max = ((offsetEnd - offsetBegin) / 4) - 1; QVector res; if (block > max) { return res; } quint16 unicodeBegin = qFromLittleEndian(data + offsetBegin + block * 4); quint16 unicodeEnd = qFromLittleEndian(data + offsetBegin + block * 4 + 2); while (unicodeBegin < unicodeEnd) { res.append(mapDataBaseToCodePoint(unicodeBegin)); unicodeBegin++; } res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be carefull when unicodeEnd==0xffff return res; } QVector KCharSelectData::sectionContents(int section) { if (!openDataFile()) { return QVector(); } const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 
28); const quint32 offsetEnd = qFromLittleEndian(data + 32); int max = ((offsetEnd - offsetBegin) / 4) - 1; QVector res; if (section > max) { return res; } for (int i = 0; i <= max; i++) { const quint16 currSection = qFromLittleEndian(data + offsetBegin + i * 4); if (currSection == section) { res.append(qFromLittleEndian(data + offsetBegin + i * 4 + 2)); } } return res; } QStringList KCharSelectData::sectionList() { if (!openDataFile()) { return QStringList(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint32 stringBegin = qFromLittleEndian(udata + 24); const quint32 stringEnd = qFromLittleEndian(udata + 28); const char *data = dataFile.constData(); QStringList list; quint32 i = stringBegin; while (i < stringEnd) { list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name")); i += qstrlen(data + i) + 1; } return list; } QString KCharSelectData::block(uint c) { return blockName(blockIndex(c)); } QString KCharSelectData::section(uint c) { return sectionName(sectionIndex(blockIndex(c))); } QString KCharSelectData::name(uint c) { if (!openDataFile()) { return QString(); } if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) { return QCoreApplication::translate("KCharSelectData", ""); } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) { return QStringLiteral("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString()); } else if (c >= 0xAC00 && c <= 0xD7AF) { /* compute hangul syllable name as per UAX #15 */ int SIndex = c - SBase; int LIndex, VIndex, TIndex; if (SIndex < 0 || SIndex >= SCount) { return QString(); } LIndex = SIndex / NCount; VIndex = (SIndex % NCount) / TCount; TIndex = SIndex % TCount; return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex]) + QLatin1String(JAMO_T_TABLE[TIndex]); } else if (c >= 0xD800 && c <= 0xDB7F) { return QCoreApplication::translate("KCharSelectData", 
""); } else if (c >= 0xDB80 && c <= 0xDBFF) { return QCoreApplication::translate("KCharSelectData", ""); } else if (c >= 0xDC00 && c <= 0xDFFF) { return QCoreApplication::translate("KCharSelectData", ""); } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { return QCoreApplication::translate("KCharSelectData", ""); } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) { return QStringLiteral("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString()); } quint16 unicode = mapCodePointToDataBase(c); if (unicode == 0xFFFF) { return QStringLiteral("NON-BMP-CHARACTER-") + formatCode(c, 4, QString()); } else { const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 4); const quint32 offsetEnd = qFromLittleEndian(data + 8); int min = 0; int mid; int max = ((offsetEnd - offsetBegin) / 6) - 1; QString s; while (max >= min) { mid = (min + max) / 2; const quint16 midUnicode = qFromLittleEndian(data + offsetBegin + mid * 6); if (unicode > midUnicode) { min = mid + 1; } else if (unicode < midUnicode) { max = mid - 1; } else { quint32 offset = qFromLittleEndian(data + offsetBegin + mid * 6 + 2); s = QString::fromUtf8(dataFile.constData() + offset + 1); break; } } if (s.isNull()) { return QCoreApplication::translate("KCharSelectData", ""); } else { return s; } } } int KCharSelectData::blockIndex(uint c) { if (!openDataFile()) { return 0; } const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 20); const quint32 offsetEnd = qFromLittleEndian(data + 24); const quint16 unicode = mapCodePointToDataBase(c); if (unicode == 0xFFFF) { return 0; } int max = ((offsetEnd - offsetBegin) / 4) - 1; int i = 0; while (unicode > qFromLittleEndian(data + offsetBegin + i * 4 + 2) && i < max) { i++; } return i; } int KCharSelectData::sectionIndex(int block) { if (!openDataFile()) { return 0; } const uchar *data = reinterpret_cast(dataFile.constData()); 
const quint32 offsetBegin = qFromLittleEndian(data + 28); const quint32 offsetEnd = qFromLittleEndian(data + 32); int max = ((offsetEnd - offsetBegin) / 4) - 1; for (int i = 0; i <= max; i++) { if (qFromLittleEndian(data + offsetBegin + i * 4 + 2) == block) { return qFromLittleEndian(data + offsetBegin + i * 4); } } return 0; } QString KCharSelectData::blockName(int index) { if (!openDataFile()) { return QString(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint32 stringBegin = qFromLittleEndian(udata + 16); const quint32 stringEnd = qFromLittleEndian(udata + 20); quint32 i = stringBegin; int currIndex = 0; const char *data = dataFile.constData(); while (i < stringEnd && currIndex < index) { i += qstrlen(data + i) + 1; currIndex++; } return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name"); } QString KCharSelectData::sectionName(int index) { if (!openDataFile()) { return QString(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint32 stringBegin = qFromLittleEndian(udata + 24); const quint32 stringEnd = qFromLittleEndian(udata + 28); quint32 i = stringBegin; int currIndex = 0; const char *data = dataFile.constData(); while (i < stringEnd && currIndex < index) { i += qstrlen(data + i) + 1; currIndex++; } return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name"); } QStringList KCharSelectData::aliases(uint c) { if (!openDataFile()) { return QStringList(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const int detailIndex = getDetailIndex(c); if (detailIndex == 0) { return QStringList(); } const quint8 count = * (quint8 *)(udata + detailIndex + 6); quint32 offset = qFromLittleEndian(udata + detailIndex + 2); QStringList aliases; const char *data = dataFile.constData(); for (int i = 0; i < count; i++) { aliases.append(QString::fromUtf8(data + offset)); offset += qstrlen(data + offset) + 1; } return aliases; } 
QStringList KCharSelectData::notes(uint c) { if (!openDataFile()) { return QStringList(); } const int detailIndex = getDetailIndex(c); if (detailIndex == 0) { return QStringList(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint8 count = * (quint8 *)(udata + detailIndex + 11); quint32 offset = qFromLittleEndian(udata + detailIndex + 7); QStringList notes; const char *data = dataFile.constData(); for (int i = 0; i < count; i++) { notes.append(QString::fromUtf8(data + offset)); offset += qstrlen(data + offset) + 1; } return notes; } QVector KCharSelectData::seeAlso(uint c) { if (!openDataFile()) { return QVector(); } const int detailIndex = getDetailIndex(c); if (detailIndex == 0) { return QVector(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint8 count = * (quint8 *)(udata + detailIndex + 26); quint32 offset = qFromLittleEndian(udata + detailIndex + 22); QVector seeAlso; for (int i = 0; i < count; i++) { seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian (udata + offset))); offset += 2; } return seeAlso; } QStringList KCharSelectData::equivalents(uint c) { if (!openDataFile()) { return QStringList(); } const int detailIndex = getDetailIndex(c); if (detailIndex == 0) { return QStringList(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint8 count = * (quint8 *)(udata + detailIndex + 21); quint32 offset = qFromLittleEndian(udata + detailIndex + 17); QStringList equivalents; const char *data = dataFile.constData(); for (int i = 0; i < count; i++) { equivalents.append(QString::fromUtf8(data + offset)); offset += qstrlen(data + offset) + 1; } return equivalents; } QStringList KCharSelectData::approximateEquivalents(uint c) { if (!openDataFile()) { return QStringList(); } const int detailIndex = getDetailIndex(c); if (detailIndex == 0) { return QStringList(); } const uchar *udata = reinterpret_cast(dataFile.constData()); const quint8 count = * (quint8 *)(udata + detailIndex + 16); quint32 
offset = qFromLittleEndian(udata + detailIndex + 12); QStringList approxEquivalents; const char *data = dataFile.constData(); for (int i = 0; i < count; i++) { approxEquivalents.append(QString::fromUtf8(data + offset)); offset += qstrlen(data + offset) + 1; } return approxEquivalents; } QVector KCharSelectData::decomposition(uint c) { // for now, only decompose Hangul Syllable into Hangul Jamo uint SIndex = c - SBase; if (SIndex >= SCount) { return QVector(); } uint L = LBase + SIndex / NCount; // Choseong uint V = VBase + (SIndex % NCount) / TCount; // Jungseong uint T = TBase + SIndex % TCount; // Jongsung QVector jamoList; jamoList.append(L); jamoList.append(V); if (T != TBase) { jamoList.append(T); } return jamoList; } QStringList KCharSelectData::unihanInfo(uint c) { if (!openDataFile()) { return QStringList(); } quint16 unicode = mapCodePointToDataBase(c); if (unicode == 0xFFFF) { return QStringList(); } const char *data = dataFile.constData(); const uchar *udata = reinterpret_cast(data); const quint32 offsetBegin = qFromLittleEndian(udata + 36); const quint32 offsetEnd = dataFile.size(); int min = 0; int mid; int max = ((offsetEnd - offsetBegin) / 30) - 1; while (max >= min) { mid = (min + max) / 2; const quint16 midUnicode = qFromLittleEndian(udata + offsetBegin + mid * 30); if (unicode > midUnicode) { min = mid + 1; } else if (unicode < midUnicode) { max = mid - 1; } else { QStringList res; for (int i = 0; i < 7; i++) { quint32 offset = qFromLittleEndian(udata + offsetBegin + mid * 30 + 2 + i * 4); if (offset != 0) { res.append(QString::fromUtf8(data + offset)); } else { res.append(QString()); } } return res; } } return QStringList(); } QChar::Category KCharSelectData::category(uint c) { if (!openDataFile()) { return QChar::category(c); } ushort unicode = mapCodePointToDataBase(c); if (unicode == 0xFFFF) { return QChar::category(c); } const uchar *data = reinterpret_cast(dataFile.constData()); const quint32 offsetBegin = qFromLittleEndian(data + 4); const 
quint32 offsetEnd = qFromLittleEndian(data + 8); int min = 0; int mid; int max = ((offsetEnd - offsetBegin) / 6) - 1; QString s; while (max >= min) { mid = (min + max) / 2; const quint16 midUnicode = qFromLittleEndian(data + offsetBegin + mid * 6); if (unicode > midUnicode) { min = mid + 1; } else if (unicode < midUnicode) { max = mid - 1; } else { quint32 offset = qFromLittleEndian(data + offsetBegin + mid * 6 + 2); uchar categoryCode = *(data + offset); Q_ASSERT(categoryCode > 0); categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1 See QtBase commit d17c76feee9eece4 */ return QChar::Category(categoryCode); } } return QChar::category(c); } bool KCharSelectData::isPrint(uint c) { QChar::Category cat = category(c); return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned); } bool KCharSelectData::isDisplayable(uint c) { // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames. // They should be seen as non-printable characters, as trying to display them leads // to a crash caused by a Qt "noBlockInString" assertion. if (c == 0xFDD0 || c == 0xFDD1) { return false; } return !isIgnorable(c) && isPrint(c); } bool KCharSelectData::isIgnorable(uint c) { /* * According to the Unicode standard, Default Ignorable Code Points * should be ignored unless explicitly supported. For example, U+202E * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying * it gives the undesired effect of all text being turned RTL. We do not * have a way to "explicitly" support it, so we will treat it as * non-printable. * * There is a list of these on * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the * property Default_Ignorable_Code_Point. */ //NOTE: not very nice to hardcode these here; is it worth it to modify // the binary data file to hold them? 
return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 || c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) || (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) || (c >= 0x2060 && c <= 0x206F) || c == 0x3164 || (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 || (c >= 0xFFF0 && c <= 0xFFF8); } bool KCharSelectData::isCombining(uint c) { return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name"); //FIXME: this is an imperfect test. There are many combining characters // that are outside of this section. See Grapheme_Extend in // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt } QString KCharSelectData::display(uint c, const QFont &font) { if (!isDisplayable(c)) { return QStringLiteral("") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QStringLiteral(""); } else { QString s = QStringLiteral(""); if (isCombining(c)) { s += displayCombining(c); } else { s += QStringLiteral("&#") + QString::number(c) + QLatin1Char(';'); } s += QStringLiteral(""); return s; } } QString KCharSelectData::displayCombining(uint c) { /* * The purpose of this is to make it easier to see how a combining * character affects the text around it. * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose, * as seen in pdfs from Unicode, but there seem to be a lot of alignment * problems with that. * * Eventually, it would be nice to determine whether the character * combines to the left or to the right, etc. 
*/ QString s = QStringLiteral(" &#") + QString::number(c) + QStringLiteral("; ") + QStringLiteral(" (ab&#") + QString::number(c) + QStringLiteral(";c)"); return s; } QString KCharSelectData::categoryText(QChar::Category category) { switch (category) { case QChar::Other_Control: return QCoreApplication::translate("KCharSelectData", "Other, Control"); case QChar::Other_Format: return QCoreApplication::translate("KCharSelectData", "Other, Format"); case QChar::Other_NotAssigned: return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned"); case QChar::Other_PrivateUse: return QCoreApplication::translate("KCharSelectData", "Other, Private Use"); case QChar::Other_Surrogate: return QCoreApplication::translate("KCharSelectData", "Other, Surrogate"); case QChar::Letter_Lowercase: return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase"); case QChar::Letter_Modifier: return QCoreApplication::translate("KCharSelectData", "Letter, Modifier"); case QChar::Letter_Other: return QCoreApplication::translate("KCharSelectData", "Letter, Other"); case QChar::Letter_Titlecase: return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase"); case QChar::Letter_Uppercase: return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase"); case QChar::Mark_SpacingCombining: return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining"); case QChar::Mark_Enclosing: return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing"); case QChar::Mark_NonSpacing: return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing"); case QChar::Number_DecimalDigit: return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit"); case QChar::Number_Letter: return QCoreApplication::translate("KCharSelectData", "Number, Letter"); case QChar::Number_Other: return QCoreApplication::translate("KCharSelectData", "Number, Other"); case QChar::Punctuation_Connector: return 
QCoreApplication::translate("KCharSelectData", "Punctuation, Connector"); case QChar::Punctuation_Dash: return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash"); case QChar::Punctuation_Close: return QCoreApplication::translate("KCharSelectData", "Punctuation, Close"); case QChar::Punctuation_FinalQuote: return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote"); case QChar::Punctuation_InitialQuote: return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote"); case QChar::Punctuation_Other: return QCoreApplication::translate("KCharSelectData", "Punctuation, Other"); case QChar::Punctuation_Open: return QCoreApplication::translate("KCharSelectData", "Punctuation, Open"); case QChar::Symbol_Currency: return QCoreApplication::translate("KCharSelectData", "Symbol, Currency"); case QChar::Symbol_Modifier: return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier"); case QChar::Symbol_Math: return QCoreApplication::translate("KCharSelectData", "Symbol, Math"); case QChar::Symbol_Other: return QCoreApplication::translate("KCharSelectData", "Symbol, Other"); case QChar::Separator_Line: return QCoreApplication::translate("KCharSelectData", "Separator, Line"); case QChar::Separator_Paragraph: return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph"); case QChar::Separator_Space: return QCoreApplication::translate("KCharSelectData", "Separator, Space"); default: return QCoreApplication::translate("KCharSelectData", "Unknown"); } } QVector KCharSelectData::find(const QString &needle) { QSet result; QVector returnRes; QString simplified = needle.simplified(); QStringList searchStrings; QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$")); QRegularExpressionMatch match = octalExp.match(simplified); if (match.hasMatch()) { // search for C octal escaped UTF-8 QByteArray utf8; int byte = -1; for (int i = 0; i <= simplified.length(); ++i) { int c = 
simplified.at(i).unicode(); if (c >= '0' && c <= '7') { byte = 8 * byte + c - '0'; } else if (byte == -1) { byte = 0; } else if (byte >= 0x00 && byte <= 0xFF) { utf8.append((char) byte); byte = 0; } } simplified = QString::fromUtf8(utf8); } if (simplified.length() <= 2) { QVector ucs4 = simplified.toUcs4(); if (ucs4.size() == 1) { // search for hex representation of the character searchStrings = QStringList(formatCode(ucs4.at(0))); } } else { searchStrings = splitString(simplified); } if (searchStrings.count() == 0) { return returnRes; } QRegularExpression hexExp(QStringLiteral("^(|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$")); foreach (const QString &s, searchStrings) { QRegularExpressionMatch match = hexExp.match(s); if (match.hasMatch()) { returnRes.append(match.captured(2).toInt(nullptr, 16)); // search for "1234" instead of "0x1234" if (s.length() == 6 || s.length() == 7) { searchStrings[searchStrings.indexOf(s)] = match.captured(2); } } // try to parse string as decimal number bool ok; int unicode = s.toInt(&ok); if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) { returnRes.append(unicode); } } bool firstSubString = true; foreach (const QString &s, searchStrings) { QSet partResult = getMatchingChars(s.toLower()); if (firstSubString) { result = partResult; firstSubString = false; } else { result = result.intersect(partResult); } } // remove results found by matching the code point to prevent duplicate results // while letting these characters stay at the beginning foreach (uint c, returnRes) { result.remove(c); } QVector sortedResult; sortedResult.reserve(result.count()); QSet::const_iterator it = result.begin(); const QSet::const_iterator end = result.end(); for ( ; it != end ; ++it ) { sortedResult.append(*it); } qSort(sortedResult); returnRes += sortedResult; return returnRes; } QSet KCharSelectData::getMatchingChars(const QString &s) { if (dataFile.isEmpty()) { return QSet(); } futureIndex.waitForFinished(); const Index index = futureIndex; 
Index::const_iterator pos = index.lowerBound(s); QSet result; while (pos != index.constEnd() && pos.key().startsWith(s)) { foreach (quint16 c, pos.value()) { result.insert(mapDataBaseToCodePoint(c)); } ++pos; } return result; } QStringList KCharSelectData::splitString(const QString &s) { QStringList result; int start = 0; int end = 0; int length = s.length(); while (end < length) { while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { end++; } if (start != end) { result.append(s.mid(start, end - start)); } start = end; while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { end++; start++; } } return result; } void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s) { const QStringList strings = splitString(s); foreach (const QString &s, strings) { (*index)[s.toLower()].append(unicode); } } Index KCharSelectData::createIndex(const QByteArray &dataFile) { Index i; // character names const uchar *udata = reinterpret_cast(dataFile.constData()); const char *data = dataFile.constData(); const quint32 nameOffsetBegin = qFromLittleEndian(udata + 4); const quint32 nameOffsetEnd = qFromLittleEndian(udata + 8); int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1; for (int pos = 0; pos <= max; pos++) { const quint16 unicode = qFromLittleEndian(udata + nameOffsetBegin + pos * 6); quint32 offset = qFromLittleEndian(udata + nameOffsetBegin + pos * 6 + 2); appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1)); } // details const quint32 detailsOffsetBegin = qFromLittleEndian(udata + 12); const quint32 detailsOffsetEnd = qFromLittleEndian(udata + 16); max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1; for (int pos = 0; pos <= max; pos++) { const quint16 unicode = qFromLittleEndian(udata + detailsOffsetBegin + pos * 27); // aliases const quint8 aliasCount = * (quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6); quint32 aliasOffset = qFromLittleEndian(udata + 
detailsOffsetBegin + pos * 27 + 2); for (int j = 0; j < aliasCount; j++) { appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset)); aliasOffset += qstrlen(data + aliasOffset) + 1; } // notes const quint8 notesCount = * (quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11); quint32 notesOffset = qFromLittleEndian(udata + detailsOffsetBegin + pos * 27 + 7); for (int j = 0; j < notesCount; j++) { appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset)); notesOffset += qstrlen(data + notesOffset) + 1; } // approximate equivalents const quint8 apprCount = * (quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16); quint32 apprOffset = qFromLittleEndian(udata + detailsOffsetBegin + pos * 27 + 12); for (int j = 0; j < apprCount; j++) { appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset)); apprOffset += qstrlen(data + apprOffset) + 1; } // equivalents const quint8 equivCount = * (quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21); quint32 equivOffset = qFromLittleEndian(udata + detailsOffsetBegin + pos * 27 + 17); for (int j = 0; j < equivCount; j++) { appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset)); equivOffset += qstrlen(data + equivOffset) + 1; } // see also - convert to string (hex) const quint8 seeAlsoCount = * (quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26); quint32 seeAlsoOffset = qFromLittleEndian(udata + detailsOffsetBegin + pos * 27 + 22); for (int j = 0; j < seeAlsoCount; j++) { quint16 seeAlso = qFromLittleEndian (udata + seeAlsoOffset); appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString())); equivOffset += qstrlen(data + equivOffset) + 1; } } // unihan data // temporary disabled due to the huge amount of data // const quint32 unihanOffsetBegin = qFromLittleEndian(udata+36); // const quint32 unihanOffsetEnd = dataFile.size(); // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1; // // for (int pos = 0; pos <= max; pos++) { // const quint16 unicode = qFromLittleEndian(udata + unihanOffsetBegin 
+ pos*30); // for(int j = 0; j < 7; j++) { // quint32 offset = qFromLittleEndian(udata + unihanOffsetBegin + pos*30 + 2 + j*4); // if(offset != 0) { // appendToIndex(&i, unicode, QString::fromUtf8(data + offset)); // } // } // } return i; }