75 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			75 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
| #!/usr/bin/env python
 | |
| 
 | |
| # This does simple normalized frequency analysis on UTF-8 encoded text. The
 | |
| # result of the analysis is translated to a ranked list, where every byte is
 | |
| # assigned a rank. The list is printed to stdout as Rust code (for src/freqs.rs).
 | |
| #
 | |
| # Currently, the frequencies are generated from the following corpuses:
 | |
| #
 | |
| #   * The CIA world fact book
 | |
| #   * The source code of rustc
 | |
| #   * Septuaginta
 | |
| 
 | |
| from __future__ import absolute_import, division, print_function
 | |
| 
 | |
| import argparse
 | |
| from collections import Counter
 | |
| import sys
 | |
| 
 | |
# Header comment emitted at the top of the generated Rust source.
preamble = (
    '// NOTE: The following code was generated by '
    '"scripts/frequencies.py", do not\n'
    '// edit directly\n'
)
 | |
| 
 | |
| 
 | |
def eprint(*args, **kwargs):
    """Print to stderr; accepts the same arguments as ``print``.

    Any ``file`` keyword supplied by the caller is overridden so output
    always goes to ``sys.stderr``.
    """
    options = dict(kwargs, file=sys.stderr)
    print(*args, **options)
 | |
| 
 | |
| 
 | |
def main():
    """Rank every byte value by its normalized frequency in the given files.

    Each FILE named on the command line is read as raw bytes. Every byte
    occurrence contributes ``1 / len(file)`` to that byte's score, so each
    file carries the same total weight regardless of its size. The scores
    are turned into ranks in the range 0-255 (a higher rank means a more
    frequent byte) and printed to stdout as a Rust ``BYTE_FREQUENCIES``
    array, preceded by the generated-code preamble. Progress messages are
    written to stderr.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Seed every byte value so bytes absent from the corpus still get a rank.
    freqs = Counter()
    for i in range(256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Close each file promptly instead of leaking the handle.
        with open(fpath, 'rb') as f:
            # Iterating a bytearray yields ints on both Python 2 and 3;
            # iterating a Python 2 str would yield 1-char strings, which
            # cannot be used to index the rank table below.
            corpus.append(bytearray(f.read()))

    eprint('computing byte frequencies')
    for c in corpus:
        if not c:
            # An empty file contributes nothing (and has no valid weight).
            continue
        # Loop-invariant per-byte weight for this file.
        weight = 1.0 / float(len(c))
        for byte in c:
            freqs[byte] += weight

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))
 | |
| 
 | |
| 
 | |
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()
 |