777 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			777 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			C++
		
	
	
	
/***********************************************************************
 | 
						|
 * © 2016 and later: Unicode, Inc. and others.
 | 
						|
 * License & terms of use: http://www.unicode.org/copyright.html
 | 
						|
 *
 | 
						|
 ***********************************************************************
 | 
						|
 ***********************************************************************
 | 
						|
 * COPYRIGHT:
 | 
						|
 * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
 | 
						|
 *
 | 
						|
 ***********************************************************************/
 | 
						|
/********************************************************************************
 | 
						|
*
 | 
						|
* File ubrkperf.cpp
 | 
						|
*
 | 
						|
* Modification History:
 | 
						|
*        Name                     Description
 | 
						|
*     Vladimir Weinstein          First Version, based on collperf
 | 
						|
*
 | 
						|
*********************************************************************************
 | 
						|
*/
 | 
						|
 | 
						|
//
 | 
						|
//  This program tests break iterator performance
 | 
						|
//      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
 | 
						|
//      (if any)
 | 
						|
//      A text file is required as input.  It must be in utf-8 or utf-16 format,
 | 
						|
//      and include a byte order mark.  Either LE or BE format is OK.
 | 
						|
//
 | 
						|
 | 
						|
// Help text printed by main() when -help is given, when an option is invalid,
// or when no input file is named.  Keep this list in step with the opts[]
// table: every option documented here must be accepted there and vice versa.
const char gUsageString[] =
    "usage:  ubrkperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16/utf-8 format file.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix                      Run test using Unix word breaking services. (currently not working) \n"
    "-mac                       Run test using MacOSX word breaking services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-char                      Use character break iterator\n"
    "-word                      Use word break iterator\n"
    "-line                      Use line break iterator\n"
    "-sentence                  Use sentence break iterator\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-passes n                  Number of times the whole test is repeated.  Default = 1.\n"
    "-time n                    Accepted, but timed runs are not implemented.  Use -loop.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump                      Display stuff.\n"
    "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
    "-next                      Do the next test\n"
    "-isBound                   Do the isBound test\n"
    ;
 | 
						|
 | 
						|
 | 
						|
#include <stdio.h>
 | 
						|
#include <string.h>
 | 
						|
#include <stdlib.h>
 | 
						|
#include <math.h>
 | 
						|
#include <locale.h>
 | 
						|
#include <errno.h>
 | 
						|
#include <sys/stat.h>
 | 
						|
 | 
						|
#include <unicode/utypes.h>
 | 
						|
#include <unicode/ucol.h>
 | 
						|
#include <unicode/ucoleitr.h>
 | 
						|
#include <unicode/uloc.h>
 | 
						|
#include <unicode/ustring.h>
 | 
						|
#include <unicode/ures.h>
 | 
						|
#include <unicode/uchar.h>
 | 
						|
#include <unicode/ucnv.h>
 | 
						|
#include <unicode/utf8.h>
 | 
						|
 | 
						|
#include <unicode/brkiter.h>
 | 
						|
 | 
						|
 | 
						|
#if U_PLATFORM_HAS_WIN32_API
 | 
						|
#include <windows.h>
 | 
						|
#else
 | 
						|
//
 | 
						|
//  Stubs for Windows API functions when building on UNIXes.
 | 
						|
//
 | 
						|
#include <sys/time.h>
 | 
						|
// Stand-in for the Win32 timeGetTime() on platforms without the Windows API.
// Returns the current wall-clock time in milliseconds, taken from
// gettimeofday().  The value is only meaningful as a difference between two
// calls; it wraps around when it no longer fits in an unsigned long.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned before multiplying: unsigned arithmetic wraps
    // modulo 2^N, whereas overflowing the signed time_t would be undefined.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
 | 
						|
#define MAKELCID(a,b) 0
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
//
 | 
						|
//  Command line option variables
 | 
						|
//     These global variables are set according to the options specified
 | 
						|
//     on the command line by the user.
 | 
						|
char * opt_fName      = 0;
 | 
						|
char * opt_locale     = "en_US";
 | 
						|
int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
 | 
						|
char * opt_rules      = 0;
 | 
						|
UBool  opt_help       = FALSE;
 | 
						|
int    opt_time       = 0;
 | 
						|
int    opt_loopCount  = 0;
 | 
						|
int    opt_passesCount= 1;
 | 
						|
UBool  opt_terse      = FALSE;
 | 
						|
UBool  opt_icu        = TRUE;
 | 
						|
UBool  opt_win        = FALSE;      // Run with Windows native functions.
 | 
						|
UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
 | 
						|
UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
 | 
						|
UBool  opt_uselen     = FALSE;
 | 
						|
UBool  opt_dump       = FALSE;
 | 
						|
UBool  opt_char       = FALSE;
 | 
						|
UBool  opt_word       = FALSE;
 | 
						|
UBool  opt_line       = FALSE;
 | 
						|
UBool  opt_sentence   = FALSE;
 | 
						|
UBool  opt_capi       = FALSE;
 | 
						|
 | 
						|
UBool  opt_next       = FALSE;
 | 
						|
UBool  opt_isBound    = FALSE;
 | 
						|
 | 
						|
 | 
						|
 | 
						|
//
 | 
						|
//   Definitions for the command line options
 | 
						|
//
 | 
						|
// One row of the command line option table.
struct OptSpec {
    // Option text exactly as typed on the command line, e.g. "-file".
    // A null name marks the end of the table.
    const char *name;

    // How the option is handled:
    //   FLAG   - takes no value; the target UBool is switched on.
    //   NUM    - the next argument is parsed as an integer into an int.
    //   STRING - the next argument is stored as a const char *.
    enum {
        FLAG,
        NUM,
        STRING
    } type;

    // Address of the variable that receives the value.  Its real type must
    // match 'type' as described above.
    void *pVar;
};
 | 
						|
 | 
						|
// Table of every option the program accepts, mapping the option text to its
// kind and to the global variable it sets.  The final all-zero row is the
// terminator that the option parser stops on.
OptSpec opts[] = {
    { "-file",     OptSpec::STRING, &opt_fName       },
    { "-locale",   OptSpec::STRING, &opt_locale      },
    { "-langid",   OptSpec::NUM,    &opt_langid      },
    { "-win",      OptSpec::FLAG,   &opt_win         },
    { "-unix",     OptSpec::FLAG,   &opt_unix        },
    { "-mac",      OptSpec::FLAG,   &opt_mac         },
    { "-uselen",   OptSpec::FLAG,   &opt_uselen      },
    { "-loop",     OptSpec::NUM,    &opt_loopCount   },
    { "-time",     OptSpec::NUM,    &opt_time        },
    { "-passes",   OptSpec::NUM,    &opt_passesCount },
    { "-char",     OptSpec::FLAG,   &opt_char        },
    { "-word",     OptSpec::FLAG,   &opt_word        },
    { "-line",     OptSpec::FLAG,   &opt_line        },
    { "-sentence", OptSpec::FLAG,   &opt_sentence    },
    { "-terse",    OptSpec::FLAG,   &opt_terse       },
    { "-dump",     OptSpec::FLAG,   &opt_dump        },
    { "-capi",     OptSpec::FLAG,   &opt_capi        },
    { "-next",     OptSpec::FLAG,   &opt_next        },
    { "-isBound",  OptSpec::FLAG,   &opt_isBound     },
    { "-help",     OptSpec::FLAG,   &opt_help        },
    { "-?",        OptSpec::FLAG,   &opt_help        },   // alias for -help
    { 0,           OptSpec::FLAG,   0                }    // end of table
};
 | 
						|
 | 
						|
 | 
						|
//---------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  Global variables pointing to and describing the test file
 | 
						|
//
 | 
						|
//---------------------------------------------------------------------------
 | 
						|
 | 
						|
//DWORD          gWinLCID;
 | 
						|
BreakIterator *brkit = NULL;
 | 
						|
UChar *text = NULL;
 | 
						|
int32_t textSize = 0;
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#if U_PLATFORM_IS_DARWIN_BASED
 | 
						|
#include <ApplicationServices/ApplicationServices.h>
 | 
						|
enum{
 | 
						|
  kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
 | 
						|
    };
 | 
						|
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
 | 
						|
TextBreakLocatorRef breakRef;
 | 
						|
UCTextBreakType macBreakType;
 | 
						|
 | 
						|
void createMACBrkIt() {
 | 
						|
  OSStatus status = noErr;
 | 
						|
  LocaleRef lref;
 | 
						|
  status = LocaleRefFromLocaleString(opt_locale, &lref);
 | 
						|
  status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
 | 
						|
  if(opt_char == TRUE) {
 | 
						|
    macBreakType = kUCTextBreakClusterMask;
 | 
						|
  } else if(opt_word == TRUE) {
 | 
						|
    macBreakType = kUCTextBreakWordMask;
 | 
						|
  } else if(opt_line == TRUE) {
 | 
						|
    macBreakType = kUCTextBreakLineMask;
 | 
						|
  } else if(opt_sentence == TRUE) {
 | 
						|
    // error
 | 
						|
    // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 | 
						|
  } else {
 | 
						|
    // default is character iterator
 | 
						|
    macBreakType = kUCTextBreakClusterMask;
 | 
						|
      }
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
void createICUBrkIt() {
 | 
						|
  //
 | 
						|
  //  Set up an ICU break iterator
 | 
						|
  //
 | 
						|
  UErrorCode          status = U_ZERO_ERROR;
 | 
						|
  if(opt_char == TRUE) {
 | 
						|
    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 | 
						|
  } else if(opt_word == TRUE) {
 | 
						|
    brkit = BreakIterator::createWordInstance(opt_locale, status);
 | 
						|
  } else if(opt_line == TRUE) {
 | 
						|
    brkit = BreakIterator::createLineInstance(opt_locale, status);
 | 
						|
  } else if(opt_sentence == TRUE) {
 | 
						|
    brkit = BreakIterator::createSentenceInstance(opt_locale, status);
 | 
						|
  } else {
 | 
						|
    // default is character iterator
 | 
						|
    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
 | 
						|
  }
 | 
						|
  if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
 | 
						|
    fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
 | 
						|
  }
 | 
						|
  if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
 | 
						|
    fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
 | 
						|
  }
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
//---------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  ProcessOptions()    Function to read the command line options.
 | 
						|
//
 | 
						|
//---------------------------------------------------------------------------
 | 
						|
// Parse the command line into the variables named by the option table.
//
//   argc, argv  the arguments as passed to main(); argv[0] is skipped.
//   opts        option table, terminated by an entry whose name is null.
//
// FLAG options switch their UBool on.  STRING and NUM options consume the
// following argument: STRING stores the argv pointer itself, NUM parses it
// with strtol() base 0 (so decimal, 0x hex and leading-0 octal all work).
//
// Returns true when every argument was understood.  On the first problem
// (unknown option, missing value, malformed or out-of-range number) a
// message is written to stderr and false is returned; variables set by
// earlier arguments keep their new values.
UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
{
    for (int argNum = 1; argNum < argc; argNum++) {
        const char *pArgName = argv[argNum];

        // Find the table row for this argument.
        OptSpec *pOpt = opts;
        while (pOpt->name != 0 && strcmp(pOpt->name, pArgName) != 0) {
            pOpt++;
        }
        if (pOpt->name == 0) {
            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
            return false;
        }

        if (pOpt->type == OptSpec::FLAG) {
            *(UBool *)(pOpt->pVar) = true;
            continue;
        }

        // STRING and NUM both take their value from the next argument.
        argNum++;
        if (argNum >= argc) {
            fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
            return false;
        }

        if (pOpt->type == OptSpec::STRING) {
            *(const char **)(pOpt->pVar) = argv[argNum];
            continue;
        }

        // OptSpec::NUM.  Reject empty input, trailing junk such as "10x",
        // and values that do not fit in the int the table points at.
        char *endp = NULL;
        errno = 0;
        long value = strtol(argv[argNum], &endp, 0);
        if (endp == argv[argNum] || *endp != '\0' ||
                errno == ERANGE || (long)(int)value != value) {
            fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
            return false;
        }
        *(int *)(pOpt->pVar) = (int)value;
    }
    return true;
}
 | 
						|
 | 
						|
 | 
						|
void doForwardTest() {
 | 
						|
  if (opt_terse == FALSE) {
 | 
						|
    printf("Doing the forward test\n");
 | 
						|
  }
 | 
						|
  int32_t noBreaks = 0;
 | 
						|
  int32_t i = 0;
 | 
						|
  unsigned long startTime = timeGetTime();
 | 
						|
  unsigned long elapsedTime = 0;
 | 
						|
  if(opt_icu) {
 | 
						|
    createICUBrkIt();
 | 
						|
    brkit->setText(UnicodeString(text, textSize));
 | 
						|
    brkit->first();
 | 
						|
    if (opt_terse == FALSE) {
 | 
						|
      printf("Warmup\n");
 | 
						|
    }
 | 
						|
    int j;
 | 
						|
    while((j = brkit->next()) != BreakIterator::DONE) {
 | 
						|
      noBreaks++;
 | 
						|
      //fprintf(stderr, "%d ", j);
 | 
						|
    }
 | 
						|
  
 | 
						|
    if (opt_terse == FALSE) {
 | 
						|
      printf("Measure\n");
 | 
						|
    } 
 | 
						|
    startTime = timeGetTime();
 | 
						|
    for(i = 0; i < opt_loopCount; i++) {
 | 
						|
      brkit->first();  
 | 
						|
      while(brkit->next() != BreakIterator::DONE) {
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    elapsedTime = timeGetTime()-startTime;
 | 
						|
  } else if(opt_mac) {
 | 
						|
#if U_PLATFORM_IS_DARWIN_BASED
 | 
						|
    createMACBrkIt();
 | 
						|
    UniChar* filePtr = text;
 | 
						|
    OSStatus status = noErr;
 | 
						|
    UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
 | 
						|
    startOffset = 0;
 | 
						|
    //printf("\t---Search forward--\n");
 | 
						|
			
 | 
						|
    while (startOffset < numUniChars)
 | 
						|
    {
 | 
						|
	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 | 
						|
                               startOffset, &breakOffset);
 | 
						|
      //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
 | 
						|
      //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
 | 
						|
				
 | 
						|
      // Output break
 | 
						|
      //printf("\t%d\n", (int)breakOffset);
 | 
						|
				
 | 
						|
      // Increment counters
 | 
						|
	noBreaks++;
 | 
						|
      startOffset = breakOffset;
 | 
						|
    }
 | 
						|
    startTime = timeGetTime();
 | 
						|
    for(i = 0; i < opt_loopCount; i++) {
 | 
						|
      startOffset = 0;
 | 
						|
			
 | 
						|
      while (startOffset < numUniChars)
 | 
						|
	{
 | 
						|
	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
 | 
						|
				   startOffset, &breakOffset);
 | 
						|
	  // Increment counters
 | 
						|
	  startOffset = breakOffset;
 | 
						|
	}
 | 
						|
    }
 | 
						|
    elapsedTime = timeGetTime()-startTime;
 | 
						|
    UCDisposeTextBreakLocator(&breakRef);
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
  }
 | 
						|
 | 
						|
 | 
						|
  if (opt_terse == FALSE) {
 | 
						|
  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 | 
						|
      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 | 
						|
      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 | 
						|
      printf("forward break iteration average loop time %d\n", loopTime);
 | 
						|
      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 | 
						|
      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 | 
						|
  } else {
 | 
						|
      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 | 
						|
  }
 | 
						|
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
void doIsBoundTest() {
 | 
						|
  int32_t noBreaks = 0, hit = 0;
 | 
						|
  int32_t i = 0, j = 0;
 | 
						|
  unsigned long startTime = timeGetTime();
 | 
						|
  unsigned long elapsedTime = 0;
 | 
						|
  createICUBrkIt();
 | 
						|
  brkit->setText(UnicodeString(text, textSize));
 | 
						|
  brkit->first();
 | 
						|
  for(j = 0; j < textSize; j++) {
 | 
						|
    if(brkit->isBoundary(j)) {
 | 
						|
      noBreaks++;
 | 
						|
      //fprintf(stderr, "%d ", j);
 | 
						|
    }
 | 
						|
  }
 | 
						|
  /*
 | 
						|
  while(brkit->next() != BreakIterator::DONE) {
 | 
						|
    noBreaks++;
 | 
						|
  }
 | 
						|
  */
 | 
						|
  
 | 
						|
  startTime = timeGetTime();
 | 
						|
  for(i = 0; i < opt_loopCount; i++) {
 | 
						|
    for(j = 0; j < textSize; j++) {
 | 
						|
      if(brkit->isBoundary(j)) {
 | 
						|
        hit++;
 | 
						|
      }
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  elapsedTime = timeGetTime()-startTime;
 | 
						|
  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
 | 
						|
  if (opt_terse == FALSE) {
 | 
						|
      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
 | 
						|
      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
 | 
						|
      printf("forward break iteration average loop time %d\n", loopTime);
 | 
						|
      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
 | 
						|
      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
 | 
						|
  } else {
 | 
						|
      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
 | 
						|
//                    Since it appears that Unicode support is going in the general
 | 
						|
//                    direction of the use of UTF-8 locales, that is the approach
 | 
						|
//                    that is used here.
 | 
						|
//
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
// Currently a no-op: the whole body is compiled out with #if 0.
//
// The disabled code converts per-line UTF-16 strings to UTF-8 with an ICU
// converter.  It refers to gNumFileLines and gFileLines, which are not
// defined in this file, so it cannot simply be switched back on.
void  UnixConvert() {
#if 0
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;

    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the error code itself, not its address.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call with no target buffer only measures the space needed.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                         gFileLines[line].unixName, // ptr to target buffer.
                                         sizeNeeded+1, // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
 | 
						|
 | 
						|
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//  class UCharFile   Class to hide all the gorp to read a file in
 | 
						|
//                    and produce a stream of UChars.
 | 
						|
//
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
class UCharFile {
 | 
						|
public:
 | 
						|
    UCharFile(const char *fileName);
 | 
						|
    ~UCharFile();
 | 
						|
    UChar   get();
 | 
						|
    UBool   eof() {return fEof;};
 | 
						|
    UBool   error() {return fError;};
 | 
						|
    int32_t size() { return fFileSize; };
 | 
						|
    
 | 
						|
private:
 | 
						|
    UCharFile (const UCharFile &other) {};                         // No copy constructor.
 | 
						|
    UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
 | 
						|
 | 
						|
    FILE         *fFile;
 | 
						|
    const char   *fName;
 | 
						|
    UBool        fEof;
 | 
						|
    UBool        fError;
 | 
						|
    UChar        fPending2ndSurrogate;
 | 
						|
    int32_t      fFileSize;
 | 
						|
    
 | 
						|
    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
 | 
						|
};
 | 
						|
 | 
						|
// Open fileName for reading and work out its encoding from the byte order
// mark.  The file must be UTF-16 (either byte order) or UTF-8 and must start
// with a BOM.  On any failure a message is written to stderr and error()
// reports true; the object is then only good for being destroyed.
//
// The pointer passed in is kept, not copied, so the string must outlive the
// object.
UCharFile::UCharFile(const char * fileName) {
    fEof                 = false;
    fError               = false;
    fName                = fileName;
    fPending2ndSurrogate = 0;
    fEncoding            = UTF8;   // placeholder so the member is never indeterminate
    fFileSize            = -1;     // "unknown" until stat() says otherwise

    struct stat buf;
    if (stat(fileName, &buf) != 0) {
        fprintf(stderr, "Error getting info\n");
    } else if (buf.st_size >= 0 && buf.st_size <= 0x7fffffff) {
        // Sizes that do not fit in int32_t stay "unknown" rather than being
        // truncated to a wrong, possibly negative, value.
        fFileSize = (int32_t)buf.st_size;
    }

    fFile = fopen(fName, "rb");
    if (fFile == NULL) {
        fprintf(stderr, "Can not open file \"%s\"\n", fileName);
        fError = true;
        return;
    }

    //
    //  Look for the byte order mark at the start of the file.
    //
    int BOMC1 = fgetc(fFile);
    int BOMC2 = fgetc(fFile);

    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
        fEncoding = UTF16LE;
    } else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
        fEncoding = UTF16BE;
    } else if (BOMC1 == 0xEF && BOMC2 == 0xBB && fgetc(fFile) == 0xBF) {
        // The third byte is only read once the first two look like UTF-8.
        fEncoding = UTF8;
    } else {
        fprintf(stderr, "ubrkperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
            "must include a BOM.\n", fileName);
        fError = true;
    }
}
 | 
						|
 | 
						|
 | 
						|
// Close the file.  The constructor leaves fFile null when the open failed,
// and fclose() must not be called with a null stream.
UCharFile::~UCharFile() {
    if (fFile != NULL) {
        fclose(fFile);
    }
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
// Return the next UTF-16 code unit of the file.
//
// UTF-16 files are read two bytes at a time in the file's byte order.  UTF-8
// files are decoded one code point at a time; a code point above U+FFFF is
// returned as its lead surrogate, and the trail surrogate is handed out by
// the following call.
//
// At end of file eof() becomes true and 0 is returned.  On malformed UTF-8 a
// message is written to stderr, error() becomes true and 0 is returned.
UChar UCharFile::get() {
    UChar   c = 0;
    switch (fEncoding) {
    case UTF16LE:
        {
            int cL = fgetc(fFile);
            int cH = fgetc(fFile);
            // Test for EOF before combining, so EOF (-1) is never shifted.
            if (cH == EOF) {
                fEof = true;
            } else {
                c = (UChar)(cL | (cH << 8));
            }
            break;
        }
    case UTF16BE:
        {
            int cH = fgetc(fFile);
            int cL = fgetc(fFile);
            if (cL == EOF) {
                fEof = true;
            } else {
                c = (UChar)(cL | (cH << 8));
            }
            break;
        }
    case UTF8:
        {
            if (fPending2ndSurrogate != 0) {
                c = fPending2ndSurrogate;
                fPending2ndSurrogate = 0;
                break;
            }

            int ch = fgetc(fFile);   // int, not UChar, so that EOF can be detected.
            if (ch == EOF) {
                fEof = true;
                break;
            }

            if (ch <= 0x7f) {
                // It's ascii.  No further utf-8 conversion.
                c = (UChar)ch;
                break;
            }

            // Work out the sequence length from the lead byte.  0x80..0xBF
            // are trail bytes and 0xF8..0xFF never occur in UTF-8, so
            // neither can start a sequence.
            int nBytes = 0;
            if (ch >= 0xF8)      {nBytes=0;}
            else if (ch >= 0xF0) {nBytes=4;}
            else if (ch >= 0xE0) {nBytes=3;}
            else if (ch >= 0xC0) {nBytes=2;}
            if (nBytes == 0) {
                fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %ld.\n", fName, ftell(fFile));
                fError = true;
                return 0;
            }

            // Read the trail bytes into a temp array, checking each one.
            unsigned char  bytes[4];
            bytes[0] = (unsigned char)ch;
            int i;
            for (i=1; i<nBytes; i++) {
                int trail = fgetc(fFile);
                if (trail == EOF || trail < 0x80 || trail >= 0xc0) {
                    fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %ld. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
                    fError = true;
                    return 0;
                }
                bytes[i] = (unsigned char)trail;
            }

            // Convert the bytes from the temp array to a Unicode char.
            i = 0;
            UChar32 cp;
            U8_NEXT_UNSAFE(bytes, i, cp);

            if (cp >= 0x10000) {
                // Needs a surrogate pair: return the lead half now and
                // remember the trail half for the next call.
                c = U16_LEAD(cp);
                fPending2ndSurrogate = U16_TRAIL(cp);
            } else {
                c = (UChar)cp;
            }
            break;
        }
    default:
        // Not reachable once the constructor has succeeded.
        break;
    }
    return c;
}
 | 
						|
 | 
						|
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
//
 | 
						|
//    Main   --  process command line, read in and pre-process the test file,
 | 
						|
//                 call other functions to do the actual tests.
 | 
						|
//
 | 
						|
//----------------------------------------------------------------------------------------
 | 
						|
int main(int argc, const char** argv) {
 | 
						|
    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
 | 
						|
        printf(gUsageString);
 | 
						|
        exit (1);
 | 
						|
    }
 | 
						|
    // Make sure that we've only got one API selected.
 | 
						|
    if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
 | 
						|
    if (opt_mac || opt_unix) opt_win = FALSE;
 | 
						|
    if (opt_mac) opt_unix = FALSE;
 | 
						|
 | 
						|
    UErrorCode          status = U_ZERO_ERROR;
 | 
						|
 | 
						|
 | 
						|
 | 
						|
    //
 | 
						|
    //  Set up a Windows LCID
 | 
						|
    //
 | 
						|
  /*
 | 
						|
    if (opt_langid != 0) {
 | 
						|
        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        gWinLCID = uloc_getLCID(opt_locale);
 | 
						|
    }
 | 
						|
  */
 | 
						|
 | 
						|
    //
 | 
						|
    //  Set the UNIX locale
 | 
						|
    //
 | 
						|
    if (opt_unix) {
 | 
						|
        if (setlocale(LC_ALL, opt_locale) == 0) {
 | 
						|
            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
 | 
						|
            exit(-1);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    // Read in  the input file.
 | 
						|
    //   File assumed to be utf-16.
 | 
						|
    //   Lines go onto heap buffers.  Global index array to line starts is created.
 | 
						|
    //   Lines themselves are null terminated.
 | 
						|
    //
 | 
						|
 | 
						|
    UCharFile f(opt_fName);
 | 
						|
    if (f.error()) {
 | 
						|
        exit(-1);
 | 
						|
    }
 | 
						|
    int32_t fileSize = f.size();
 | 
						|
    const int STARTSIZE = 70000;
 | 
						|
    int32_t bufSize = 0;
 | 
						|
    int32_t charCount = 0;
 | 
						|
    if(fileSize != -1) {
 | 
						|
      text = (UChar *)malloc(fileSize*sizeof(UChar));
 | 
						|
      bufSize = fileSize;
 | 
						|
    } else {
 | 
						|
      text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
 | 
						|
      bufSize = STARTSIZE;
 | 
						|
    }
 | 
						|
    if(text == NULL) {
 | 
						|
      fprintf(stderr, "Allocating buffer failed\n");
 | 
						|
      exit(-1);
 | 
						|
    }
 | 
						|
    
 | 
						|
 | 
						|
    //  Read the file, split into lines, and save in memory.
 | 
						|
    //  Loop runs once per utf-16 value from the input file,
 | 
						|
    //    (The number of bytes read from file per loop iteration depends on external encoding.)
 | 
						|
    for (;;) {
 | 
						|
 | 
						|
        UChar c = f.get();
 | 
						|
        if(f.eof()) {
 | 
						|
          break;
 | 
						|
        }
 | 
						|
        if (f.error()){
 | 
						|
          exit(-1);
 | 
						|
        }
 | 
						|
        // We now have a good UTF-16 value in c.
 | 
						|
        text[charCount++] = c;
 | 
						|
        if(charCount == bufSize) {
 | 
						|
          text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
 | 
						|
          if(text == NULL) {
 | 
						|
            fprintf(stderr, "Reallocating buffer failed\n");
 | 
						|
            exit(-1);
 | 
						|
          }
 | 
						|
          bufSize *= 2;
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
 | 
						|
    if (opt_terse == FALSE) {
 | 
						|
        printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
 | 
						|
    }
 | 
						|
 | 
						|
    textSize = charCount;
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
    //
 | 
						|
    //  Dump file contents if requested.
 | 
						|
    //
 | 
						|
    if (opt_dump) {
 | 
						|
      // dump file, etc... possibly
 | 
						|
    }
 | 
						|
 | 
						|
 | 
						|
    //
 | 
						|
    //  We've got the file read into memory.  Go do something with it.
 | 
						|
    //
 | 
						|
    int32_t i = 0;
 | 
						|
    for(i = 0; i < opt_passesCount; i++) {
 | 
						|
      if(opt_loopCount != 0) {
 | 
						|
        if(opt_next) {
 | 
						|
          doForwardTest();
 | 
						|
        } else if(opt_isBound) {
 | 
						|
          doIsBoundTest();
 | 
						|
        } else {
 | 
						|
          doForwardTest();
 | 
						|
        }
 | 
						|
      } else if(opt_time != 0) {
 | 
						|
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
  if(text != NULL) {
 | 
						|
    free(text);
 | 
						|
  }
 | 
						|
    if(brkit != NULL) {
 | 
						|
      delete brkit;
 | 
						|
    }
 | 
						|
 | 
						|
    return 0;
 | 
						|
}
 |