123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- **********************************************************************
- * Copyright (C) 2005-2015, International Business Machines
- * Corporation and others. All Rights Reserved.
- **********************************************************************
- */
- #ifndef __CSRSBCS_H
- #define __CSRSBCS_H
- #include "unicode/uobject.h"
- #if !UCONFIG_NO_CONVERSION
- #include "csrecog.h"
- U_NAMESPACE_BEGIN
- class NGramParser : public UMemory
- {
- private:
- int32_t ngram;
- const int32_t *ngramList;
- int32_t ngramCount;
- int32_t hitCount;
- protected:
- int32_t byteIndex;
- const uint8_t *charMap;
- void addByte(int32_t b);
- public:
- NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
- virtual ~NGramParser();
- private:
- /*
- * Binary search for value in table, which must have exactly 64 entries.
- */
- int32_t search(const int32_t *table, int32_t value);
- void lookup(int32_t thisNgram);
-
- virtual int32_t nextByte(InputText *det);
- virtual void parseCharacters(InputText *det);
- public:
- int32_t parse(InputText *det);
- };
- #if !UCONFIG_ONLY_HTML_CONVERSION
- class NGramParser_IBM420 : public NGramParser
- {
- public:
- NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
- ~NGramParser_IBM420();
- private:
- int32_t alef;
- int32_t isLamAlef(int32_t b);
- int32_t nextByte(InputText *det);
- void parseCharacters(InputText *det);
- };
- #endif
- class CharsetRecog_sbcs : public CharsetRecognizer
- {
- public:
- CharsetRecog_sbcs();
- virtual ~CharsetRecog_sbcs();
- virtual const char *getName() const = 0;
- virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
- virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
- };
- class CharsetRecog_8859_1 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_1();
- const char *getName() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_2 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_2();
- const char *getName() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_5 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_5();
- const char *getName() const;
- };
- class CharsetRecog_8859_6 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_6();
- const char *getName() const;
- };
- class CharsetRecog_8859_7 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_7();
- const char *getName() const;
- };
- class CharsetRecog_8859_8 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_8();
-
- virtual const char *getName() const;
- };
- class CharsetRecog_8859_9 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_8859_9();
- const char *getName() const;
- };
- class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
- {
- public:
- virtual ~CharsetRecog_8859_5_ru();
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
- {
- public:
- virtual ~CharsetRecog_8859_6_ar();
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
- {
- public:
- virtual ~CharsetRecog_8859_7_el();
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
- {
- public:
- virtual ~CharsetRecog_8859_8_I_he();
-
- const char *getName() const;
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
- {
- public:
- virtual ~CharsetRecog_8859_8_he ();
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
- {
- public:
- virtual ~CharsetRecog_8859_9_tr ();
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_windows_1256();
- const char *getName() const;
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_windows_1251();
- const char *getName() const;
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_KOI8_R();
- const char *getName() const;
- const char *getLanguage() const;
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- #if !UCONFIG_ONLY_HTML_CONVERSION
- class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_IBM424_he();
- const char *getLanguage() const;
- };
- class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
- public:
- virtual ~CharsetRecog_IBM424_he_rtl();
-
- const char *getName() const;
-
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
- virtual ~CharsetRecog_IBM424_he_ltr();
-
- const char *getName() const;
-
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
- {
- public:
- virtual ~CharsetRecog_IBM420_ar();
- const char *getLanguage() const;
- int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
-
- };
- class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
- public:
- virtual ~CharsetRecog_IBM420_ar_rtl();
-
- const char *getName() const;
-
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
- virtual ~CharsetRecog_IBM420_ar_ltr();
-
- const char *getName() const;
-
- virtual UBool match(InputText *det, CharsetMatch *results) const;
- };
- #endif
- U_NAMESPACE_END
- #endif /* !UCONFIG_NO_CONVERSION */
- #endif /* __CSRSBCS_H */
|