You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

131 lines
4.8 KiB
Python

# Copyright 2015 Google Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# get ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
# Usage: python tools/mk_puncttable.py UnicodeData.txt > src/puncttable.rs
import sys
def get_bits(high, punct):
    """Return the 16-bit membership mask for one block of 16 code points.

    Bit ``i`` (0 <= i < 16) of the result is set exactly when the code
    point ``high * 16 + i`` is contained in ``punct``.
    """
    base = high * 16
    # Each offset contributes a distinct power of two, so summing the
    # selected bits is the same as OR-ing them together.
    return sum(1 << offset for offset in range(16) if base + offset in punct)
def main(args):
    """Write the generated Rust punctuation tables to standard output.

    ``args`` is an argv-style list; ``args[1]`` is the path of the
    UnicodeData.txt file to read.  Every line is split on ``;``: field 0 is
    parsed as a hexadecimal code point and field 2 is compared against the
    punctuation general categories (Pc, Pd, Pe, Pf, Pi, Po, Ps).

    The emitted Rust source contains:

    * ``PUNCT_MASKS_ASCII`` - one 16-bit mask per block of 16 code points
      below 128, built from the hard-coded ASCII punctuation string;
    * ``PUNCT_TAB`` - the sorted block numbers (code point // 16) of every
      block that holds a punctuation code point >= 128;
    * ``PUNCT_MASKS`` - the 16-bit mask for each ``PUNCT_TAB`` entry;
    * the lookup functions and their unit tests.

    Raises whatever ``open`` raises for an unreadable path, ``IndexError``
    if ``args`` has no element 1, and ``ValueError`` if the input contains
    no punctuation code point (``max`` of an empty set).
    """
    ascii_punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    ascii_set = set(ord(c) for c in ascii_punct)

    punct = set()
    # open() inside a with-block closes the handle deterministically; the
    # file() builtin used previously exists only on Python 2.
    with open(args[1]) as data:
        for line in data:
            spl = line.split(';')
            if len(spl) < 3:
                # Blank or truncated line: there is no category field.
                continue
            if spl[2] in ('Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps'):
                punct.add(int(spl[0], 16))

    # Block numbers of all non-ASCII blocks that contain punctuation.
    pshift = sorted(set(cp // 16 for cp in punct if cp >= 128))
    bits = [get_bits(high, punct) for high in pshift]

    def emit(text):
        # Same bytes as the Python 2 print statement (text plus newline),
        # but valid on both Python 2 and Python 3.
        sys.stdout.write(text + "\n")

    emit("""// Copyright 2015 Google Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//! CommonMark punctuation set based on spec and Unicode properties.
// Autogenerated by mk_puncttable.py
const PUNCT_MASKS_ASCII: [u16; 8] = [""")
    for x in range(8):
        y = get_bits(x, ascii_set)
        emit(' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15))
    emit(""" ];
const PUNCT_TAB: [u16; %i] = [""" % len(pshift))
    for x in pshift:
        emit(' %d, // U+%04X...U+%04X' % (x, x * 16, x * 16 + 15))
    emit(""" ];
const PUNCT_MASKS: [u16; %i] = [""" % len(pshift))
    for x, y in zip(pshift, bits):
        emit(' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15))
    # Raw string: the Rust '\u{...}' escapes below must reach the output
    # verbatim, and a non-raw "\u{" is a syntax error on Python 3.  The one
    # %04X placeholder receives the largest punctuation code point, used by
    # the generated code as an early-out upper bound.
    emit(r""" ];
pub fn is_ascii_punctuation(c: u8) -> bool {
c < 128 && (PUNCT_MASKS_ASCII[(c / 16) as usize] & (1 << (c & 15))) != 0
}
pub fn is_punctuation(c: char) -> bool {
let cp = c as u32;
if cp < 128 {return is_ascii_punctuation(cp as u8); }
if cp > 0x%04X { return false; }
let high = (cp / 16) as u16;
match PUNCT_TAB.binary_search(&high) {
Ok(index) => (PUNCT_MASKS[index] & (1 << (cp & 15))) != 0,
_ => false
}
}
#[cfg(test)]
mod tests {
use super::{is_ascii_punctuation, is_punctuation};
#[test]
fn test_ascii() {
assert!(is_ascii_punctuation(b'!'));
assert!(is_ascii_punctuation(b'@'));
assert!(is_ascii_punctuation(b'~'));
assert!(!is_ascii_punctuation(b' '));
assert!(!is_ascii_punctuation(b'0'));
assert!(!is_ascii_punctuation(b'A'));
assert!(!is_ascii_punctuation(0xA1));
}
#[test]
fn test_unicode() {
assert!(is_punctuation('~'));
assert!(!is_punctuation(' '));
assert!(is_punctuation('\u{00A1}'));
assert!(is_punctuation('\u{060C}'));
assert!(is_punctuation('\u{FF65}'));
assert!(is_punctuation('\u{1BC9F}'));
assert!(!is_punctuation('\u{1BCA0}'));
}
}
""" % max(punct))
# Run the generator only when executed as a script, so that importing this
# module (e.g. to reuse get_bits) does not read files or print anything.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit('Usage: python tools/mk_puncttable.py UnicodeData.txt > src/puncttable.rs')
    main(sys.argv)