Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/chardet/mbcharsetprober.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 
0
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
1 ######################## BEGIN LICENSE BLOCK ######################## | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
2 # The Original Code is Mozilla Universal charset detector code. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
3 # | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
4 # The Initial Developer of the Original Code is | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
5 # Netscape Communications Corporation. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
6 # Portions created by the Initial Developer are Copyright (C) 2001 | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
7 # the Initial Developer. All Rights Reserved. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
8 # | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
9 # Contributor(s): | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
10 # Mark Pilgrim - port to Python | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
11 # Shy Shalom - original C code | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
12 # Proofpoint, Inc. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
13 # | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
14 # This library is free software; you can redistribute it and/or | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
15 # modify it under the terms of the GNU Lesser General Public | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
16 # License as published by the Free Software Foundation; either | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
17 # version 2.1 of the License, or (at your option) any later version. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
18 # | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
19 # This library is distributed in the hope that it will be useful, | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
22 # Lesser General Public License for more details. | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
23 # | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
24 # You should have received a copy of the GNU Lesser General Public | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
25 # License along with this library; if not, write to the Free Software | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
26 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
27 # 02110-1301 USA | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
28 ######################### END LICENSE BLOCK ######################### | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
29 | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
30 from .charsetprober import CharSetProber | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
31 from .enums import ProbingState, MachineState | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
32 | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
33 | 
| 
 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
class MultiByteCharSetProber(CharSetProber):
    """
    Common prober logic shared by multi-byte charset probers.

    The prober drives two collaborators, both initialised to ``None`` here:

    * ``coding_sm`` -- a coding state machine that is advanced one byte at
      a time via ``next_state()``.
    * ``distribution_analyzer`` -- an analyzer that is fed the byte pair
      ending each character and supplies the confidence value.

    Subclasses are expected to assign both attributes and to override the
    ``charset_name`` and ``language`` properties, which raise
    ``NotImplementedError`` in this class.
    """

    def __init__(self, lang_filter=None):
        """Initialise the prober; ``lang_filter`` is passed to the base class."""
        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        # Two-slot buffer bridging consecutive feed() calls: slot 0 keeps the
        # final byte of the previous chunk, slot 1 receives the first byte of
        # the current chunk when a character ends on it.
        self._last_char = [0, 0]

    def reset(self):
        """Reset the base prober, any attached collaborators and the buffer."""
        super(MultiByteCharSetProber, self).reset()
        if self.coding_sm:
            self.coding_sm.reset()
        if self.distribution_analyzer:
            self.distribution_analyzer.reset()
        self._last_char = [0, 0]

    @property
    def charset_name(self):
        """Name of the detected charset; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def language(self):
        """Language of the detected charset; must be provided by subclasses."""
        raise NotImplementedError

    def feed(self, byte_str):
        """
        Run ``byte_str`` through the state machine and distribution analyzer.

        Each byte is passed to ``coding_sm.next_state()``:

        * ``MachineState.ERROR`` sets the prober state to
          ``ProbingState.NOT_ME`` and stops scanning.
        * ``MachineState.ITS_ME`` sets the prober state to
          ``ProbingState.FOUND_IT`` and stops scanning.
        * ``MachineState.START`` feeds the current byte together with the
          one before it to the distribution analyzer, along with the length
          reported by ``coding_sm.get_current_charlen()``.

        An empty ``byte_str`` is accepted and leaves the carried-over byte
        untouched (previously it raised ``IndexError``).

        :param byte_str: chunk of input bytes; may be empty.
        :returns: the prober state (``self.state``) after this chunk.
        """
        for i, byte in enumerate(byte_str):
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, i)
                self._state = ProbingState.NOT_ME
                break
            elif coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The preceding byte belongs to the previous chunk, so
                    # pair the byte saved in slot 0 with this one.
                    self._last_char[1] = byte
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
                                                    char_len)

        # Remember the final byte for the next call. This runs even after an
        # early break, as before; an empty chunk has no final byte to save.
        if byte_str:
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Shortcut: with enough data and confidence above
            # SHORTCUT_THRESHOLD (not defined in this class; presumably
            # inherited from CharSetProber), declare the charset found.
            if (self.distribution_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self.distribution_analyzer.get_confidence()
