# Lists every code point that is new in Unicode 12.0 together with a derived
# property value (PVALID, CONTEXTJ or DISALLOWED) and, when present, the first
# UnicodeData.txt field recorded for that code point.
#
# The category letters (A)-(J) used below mirror the rule names of RFC 5892
# (IDNA2008), which this script appears to follow -- confirm against the RFC
# before relying on the output.
#
# Input:  UnicodeData.txt in the current working directory; each line is split
#         on ';' and keyed by its first field (the hex code point).
# Output: one line per selected code point on stdout, in the form
#         "<HEX> <PROPERTY>[ <FIRST FIELD>]".

UNICODE_DATA_PATH = 'UnicodeData.txt'

# A code point is treated as new in 12.0 when it matches Age=12.0 but not
# Age=11.0. Because of this filter, Unassigned (J) can never occur.
AGE_12_0 = /\p{Age=12.0}/
AGE_11_0 = /\p{Age=11.0}/

# LetterDigits (A)
LETTER_DIGITS = /\p{Ll}|\p{Lu}|\p{Lo}|\p{Nd}|\p{Lm}|\p{Mn}|\p{Mc}/
# IgnorableProperties (C)
IGNORABLE_PROPERTIES =
  /\p{Default_Ignorable_Code_Point}|\p{White_Space}|\p{Noncharacter_Code_Point}/
# IgnorableBlocks (D)
IGNORABLE_BLOCKS =
  /\p{In_Combining_Diacritical_Marks_for_Symbols}|\p{In_Musical_Symbols}|\p{In_Ancient_Greek_Musical_Notation}/
# JoinControl (H)
JOIN_CONTROL = /\p{Join_Control}/
# OldHangulJamo (I). NOTE(review): Grapheme_Cluster_Break is used as a stand-in
# here; the original author was not completely sure it is the right property.
OLD_HANGUL_JAMO =
  /\p{Grapheme_Cluster_Break=L}|\p{Grapheme_Cluster_Break=V}|\p{Grapheme_Cluster_Break=T}/

# Unstable (B): true when the character is changed by NFKC normalization,
# case folding, and a second NFKC normalization.
def unstable?(char)
  char != char.unicode_normalize(:nfkc).downcase(:fold).unicode_normalize(:nfkc)
end

# Returns the derived property string for a single-character string.
#
# Rules are checked in this order: JoinControl (H) wins, then any of
# Unstable (B), IgnorableProperties (C), IgnorableBlocks (D) or
# OldHangulJamo (I) disallows, then LetterDigits (A) permits; everything else
# is disallowed.
#
# Categories not evaluated:
#   LDH (E)                - fixed, so it cannot apply to new code points.
#   Exceptions (F)         - no exceptions among new code points yet.
#   BackwardCompatible (G) - empty.
def derived_property(char)
  if char.match?(JOIN_CONTROL)
    'CONTEXTJ'
  elsif unstable?(char) ||
        char.match?(IGNORABLE_PROPERTIES) ||
        char.match?(IGNORABLE_BLOCKS) ||
        char.match?(OLD_HANGUL_JAMO)
    'DISALLOWED'
  elsif char.match?(LETTER_DIGITS)
    'PVALID'
  else
    'DISALLOWED'
  end
end

# Maps the hex code point (first field of each line) to the remaining fields.
codepoints = File.foreach(UNICODE_DATA_PATH).each_with_object({}) do |line, table|
  code, *fields = line.chomp.split(';')
  table[code] = fields
end

# Walk every Unicode scalar value; the surrogate range D800..DFFF is skipped
# because it cannot be encoded as a UTF-8 character.
[0x0000..0xD7FF, 0xE000..0x10FFFF].each do |range|
  range.each do |code|
    char = code.chr('UTF-8')
    next unless char.match?(AGE_12_0) && !char.match?(AGE_11_0)

    # Upper-case hex, zero-padded to at least four digits, which is the key
    # format used when building the table above.
    hex = format('%04X', code)
    columns = [hex, derived_property(char)]

    entry = codepoints[hex]
    columns << entry.first if entry

    puts columns.join(' ')
  end
end