#!/bin/sh

# Runs `ucd-generate dfa` once per requested command, passing
# src/unicode/fsm/ as the directory argument. The regexes for the grapheme,
# word and sentence commands are produced by helper scripts under
# "$D/regex/".
#
# Usage: <this script> [--list-commands] [<command>] ...

# Abort as soon as any unchecked command fails.
set -e

# Directory containing this script; helper scripts are looked up relative
# to it.
D="$(dirname "$0")"

# Convenience function for checking that a command exists.
#
# Arguments: $1 - name of the command to look up with `command -v`.
# Outputs:   a "DEPENDENCY MISSING" message on stderr when it is not found.
# Exits:     terminates the shell with status 1 when the command is missing;
#            otherwise returns normally.
requires() {
    # "$1" is used directly rather than copied into a variable: POSIX sh has
    # no function-local variables, so an assignment here would overwrite a
    # caller's variable of the same name.
    if ! command -v "$1" > /dev/null 2>&1; then
        echo "DEPENDENCY MISSING: $1 must be installed" >&2
        exit 1
    fi
}

# Test if a list of words ($2 and onwards) contains a particular element ($1).
#
# Arguments: $1   - the element to look for (compared as an exact string).
#            $2.. - the candidate elements.
# Returns:   0 if some candidate equals $1, 1 otherwise (including when no
#            candidates, or no arguments at all, are given).
# Note:      assigns the global variables `needle` and `el` (POSIX sh has no
#            function-local variables).
array_exists() {
    # Without this guard, `shift` would fail when called with no arguments.
    if [ $# -eq 0 ]; then
        return 1
    fi
    needle="$1"
    shift

    for el in "$@"; do
        if [ "$el" = "$needle" ]; then
            return 0
        fi
    done
    return 1
}

# Generate the forward and reverse grapheme break DFAs.
#
# Globals:   D (read)        - directory holding regex/grapheme.sh.
#            regex (written) - output of running that helper script with sh.
# Outputs:   one progress line on stdout before each ucd-generate run.
# Both runs are given the same regex and the src/unicode/fsm/ directory
# argument; the reverse run adds --reverse --longest.
graphemes() {
    regex="$(sh "$D/regex/grapheme.sh")"

    echo "generating forward grapheme DFA"
    ucd-generate dfa \
        --name GRAPHEME_BREAK_FWD \
        --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ \
        "$regex"

    echo "generating reverse grapheme DFA"
    ucd-generate dfa \
        --name GRAPHEME_BREAK_REV \
        --reverse --longest \
        --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ \
        "$regex"
}

# Generate the forward word break DFA.
#
# Globals:   D (read)        - directory holding regex/word.sh.
#            regex (written) - output of running that helper script with sh.
# Outputs:   one progress line on stdout before the ucd-generate run.
words() {
    regex="$(sh "$D/regex/word.sh")"

    echo "generating forward word DFA (this can take a while)"
    ucd-generate dfa \
        --name WORD_BREAK_FWD \
        --sparse --minimize --anchored --state-size 4 \
        src/unicode/fsm/ \
        "$regex"
}

# Generate the forward sentence break DFA.
#
# Globals:   D (read)        - directory holding regex/sentence.sh.
#            regex (written) - output of running that helper script with sh.
# Outputs:   one progress line on stdout before the ucd-generate run.
sentences() {
    regex="$(sh "$D/regex/sentence.sh")"

    echo "generating forward sentence DFA (this can take a while)"
    ucd-generate dfa \
        --name SENTENCE_BREAK_FWD \
        --minimize \
        --sparse --anchored --state-size 4 \
        src/unicode/fsm/ \
        "$regex"
}

# Generate the reverse regional indicator DFA.
#
# Outputs: one progress line on stdout before the ucd-generate run.
regional_indicator() {
    # For finding all occurrences of regional indicators. This is used to
    # handle regional indicators as a special case for the reverse grapheme
    # iterator and the reverse word iterator.
    echo "generating regional indicator DFA"
    # The regex is a fixed literal, so it is single-quoted to make clear that
    # the shell performs no expansion on it.
    ucd-generate dfa \
        --name REGIONAL_INDICATOR_REV \
        --reverse \
        --classes --minimize --anchored --premultiply --state-size 1 \
        src/unicode/fsm/ \
        '\p{gcb=Regional_Indicator}'
}

# Generate the forward simple word DFA for the regex \w.
#
# Outputs: one progress line on stdout before the ucd-generate run.
# Note:    unlike the other generators in this file, --anchored is not passed.
simple_word() {
    echo "generating forward simple word DFA"
    # The regex is a fixed literal, so it is single-quoted.
    ucd-generate dfa \
        --name SIMPLE_WORD_FWD \
        --sparse --minimize --state-size 2 \
        src/unicode/fsm/ \
        '\w'
}

# Generate the forward and reverse anchored whitespace DFAs for the regex \s+.
#
# Outputs: one progress line on stdout before each ucd-generate run.
# NOTE(review): the forward DFA uses --state-size 1 while the reverse DFA
# uses --state-size 2; both values are kept as they were - confirm the
# difference is intentional.
whitespace() {
    echo "generating forward whitespace DFA"
    # The regex is a fixed literal, so it is single-quoted.
    ucd-generate dfa \
        --name WHITESPACE_ANCHORED_FWD \
        --anchored --classes --premultiply --minimize --state-size 1 \
        src/unicode/fsm/ \
        '\s+'

    echo "generating reverse whitespace DFA"
    ucd-generate dfa \
        --name WHITESPACE_ANCHORED_REV \
        --reverse \
        --anchored --classes --premultiply --minimize --state-size 2 \
        src/unicode/fsm/ \
        '\s+'
}

# Entry point: interpret the arguments and run the requested generators.
#
# Arguments: any mix of
#              -h | --help      print usage on stderr and exit
#              --list-commands  print the known command names, one per line,
#                               and exit
#              all              run every known command
#              <command>        one of the names listed in $commands below
#            With no arguments every known command is run.
# Returns:   0 when every argument was a known command, 1 when at least one
#            was not. Known commands are still run in that case.
# Note:      assigns the global variables commands, cmds, cmd, fun and
#            status (POSIX sh has no function-local variables).
main() {
    if array_exists "-h" "$@" || array_exists "--help" "$@"; then
        echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
        exit
    fi

    # Whitespace-separated list of known commands. A '-' in a command name
    # corresponds to '_' in the name of the function implementing it.
    commands="
        graphemes
        sentences
        words
        regional-indicator
        simple-word
        whitespace
    "
    if array_exists "--list-commands" "$@"; then
        for cmd in $commands; do
            echo "$cmd"
        done
        exit
    fi

    # ucd-generate is used to compile regexes into DFAs.
    requires ucd-generate

    mkdir -p src/unicode/fsm/

    cmds=$*
    if [ $# -eq 0 ] || array_exists "all" "$@"; then
        cmds=$commands
    fi

    status=0
    for cmd in $cmds; do
        # $commands is deliberately unquoted so that it splits into one
        # argument per command name.
        if array_exists "$cmd" $commands; then
            fun="$(echo "$cmd" | sed 's/-/_/g')"
            # $cmd was just validated against the fixed list above, so the
            # function can be called directly; no eval is required.
            "$fun"
        else
            echo "unrecognized command: $cmd" >&2
            # Remember the failure so the caller sees a non-zero status.
            status=1
        fi
    done
    return "$status"
}

# Run the entry point with the script's own arguments.
main "$@"