mirror of
https://gitflic.ru/project/photopea-v2/photopea-v-2.git
synced 2026-06-20 15:41:13 +00:00
Mirror "Templates" and "Plugins" community pages
This commit is contained in:
+27
-4
@@ -1,9 +1,12 @@
|
||||
#!/bin/python3
|
||||
|
||||
import sys
|
||||
sys.path.insert(0,"_vendor")
|
||||
|
||||
import requests
|
||||
import os, sys
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
sys.path.insert(0,"_vendor")
|
||||
from tqdm import tqdm
|
||||
from dataclasses import dataclass
|
||||
import glob
|
||||
@@ -30,13 +33,29 @@ urls = [
|
||||
"code/storages/deviceStorage.html",
|
||||
"code/storages/googledriveStorage.html",
|
||||
"code/storages/dropboxStorage.html",
|
||||
"rsrc/basic/fa_basic.csh"
|
||||
"rsrc/basic/fa_basic.csh",
|
||||
"img/nft.png",
|
||||
["templates/?type=0&rsrc=","templates/?type=0.html"],
|
||||
["templates/?type=1&rsrc=","templates/?type=1.html"],
|
||||
["templates/?type=2&rsrc=","templates/?type=2.html"],
|
||||
["templates/?type=3&rsrc=","templates/?type=3.html"],
|
||||
"templates/templates.js",
|
||||
"templates/templates.css"
|
||||
]
|
||||
|
||||
|
||||
|
||||
#Update files
|
||||
def dl_file(path):
|
||||
if isinstance(path,list):
|
||||
output=path[1]
|
||||
path=path[0]
|
||||
else:
|
||||
output=path
|
||||
path=path
|
||||
outfn = root + output
|
||||
if os.path.exists(outfn):
|
||||
return
|
||||
with tqdm(desc=path, unit="B", unit_scale=True) as progress_bar:
|
||||
r = requests.get(website + path, stream=True)
|
||||
progress_bar.total = int(r.headers.get("Content-Length", 0))
|
||||
@@ -45,7 +64,7 @@ def dl_file(path):
|
||||
progress_bar.desc += "ERROR: HTTP Status %d" % r.status_code
|
||||
return
|
||||
|
||||
outfn = root + path
|
||||
|
||||
os.makedirs(os.path.dirname(outfn), exist_ok=True)
|
||||
with open(outfn, "wb") as outf:
|
||||
for chunk in r.iter_content(chunk_size=1024):
|
||||
@@ -145,3 +164,7 @@ find_and_replace('code/storages/dropboxStorage.html', 'var redirectUri = window.
|
||||
find_and_replace('index.html','https://connect.facebook.net','')
|
||||
|
||||
find_and_replace('index.html','https://www.facebook.com','')
|
||||
|
||||
#Redirect dynamic pages to static equivalent
|
||||
find_and_replace('code/pp/pp.js','"&rsrc="','".html"')
|
||||
find_and_replace('code/pp/pp.js','"templates/?type="','"templates/%3Ftype="')
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pip
|
||||
@@ -0,0 +1,21 @@
|
||||
This package contains a modified version of ca-bundle.crt:
|
||||
|
||||
ca-bundle.crt -- Bundle of CA Root Certificates
|
||||
|
||||
Certificate data from Mozilla as of: Thu Nov 3 19:04:19 2011#
|
||||
This is a bundle of X.509 certificates of public Certificate Authorities
|
||||
(CA). These were automatically extracted from Mozilla's root certificates
|
||||
file (certdata.txt). This file can be found in the mozilla source tree:
|
||||
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
||||
It contains the certificates in PEM format and therefore
|
||||
can be directly used with curl / libcurl / php_curl, or with
|
||||
an Apache+mod_ssl webserver for SSL client authentication.
|
||||
Just configure this file as the SSLCACertificateFile.#
|
||||
|
||||
***** BEGIN LICENSE BLOCK *****
|
||||
This Source Code Form is subject to the terms of the Mozilla Public License,
|
||||
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
||||
one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
***** END LICENSE BLOCK *****
|
||||
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
||||
@@ -0,0 +1,83 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: certifi
|
||||
Version: 2022.9.24
|
||||
Summary: Python package for providing Mozilla's CA Bundle.
|
||||
Home-page: https://github.com/certifi/python-certifi
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.com
|
||||
License: MPL-2.0
|
||||
Project-URL: Source, https://github.com/certifi/python-certifi
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Requires-Python: >=3.6
|
||||
License-File: LICENSE
|
||||
|
||||
Certifi: Python SSL Certificates
|
||||
================================
|
||||
|
||||
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
||||
validating the trustworthiness of SSL certificates while verifying the identity
|
||||
of TLS hosts. It has been extracted from the `Requests`_ project.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
||||
|
||||
$ pip install certifi
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
To reference the installed certificate authority (CA) bundle, you can use the
|
||||
built-in function::
|
||||
|
||||
>>> import certifi
|
||||
|
||||
>>> certifi.where()
|
||||
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
||||
|
||||
Or from the command line::
|
||||
|
||||
$ python -m certifi
|
||||
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
||||
|
||||
Enjoy!
|
||||
|
||||
1024-bit Root Certificates
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Browsers and certificate authorities have concluded that 1024-bit keys are
|
||||
unacceptably weak for certificates, particularly root certificates. For this
|
||||
reason, Mozilla has removed any weak (i.e. 1024-bit key) certificate from its
|
||||
bundle, replacing it with an equivalent strong (i.e. 2048-bit or greater key)
|
||||
certificate from the same CA. Because Mozilla removed these certificates from
|
||||
its bundle, ``certifi`` removed them as well.
|
||||
|
||||
In previous versions, ``certifi`` provided the ``certifi.old_where()`` function
|
||||
to intentionally re-add the 1024-bit roots back into your bundle. This was not
|
||||
recommended in production and therefore was removed at the end of 2018.
|
||||
|
||||
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
||||
|
||||
Addition/Removal of Certificates
|
||||
--------------------------------
|
||||
|
||||
Certifi does not support any addition/removal or other modification of the
|
||||
CA trust store content. This project is intended to provide a reliable and
|
||||
highly portable root of trust to python deployments. Look to upstream projects
|
||||
for methods to use alternate trust.
|
||||
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
certifi-2022.9.24.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
certifi-2022.9.24.dist-info/LICENSE,sha256=oC9sY4-fuE0G93ZMOrCF2K9-2luTwWbaVDEkeQd8b7A,1052
|
||||
certifi-2022.9.24.dist-info/METADATA,sha256=33NAOmkqKTCb2u1Ys8Zth7ABWXfEuLgp-5gLp1yK_7A,2911
|
||||
certifi-2022.9.24.dist-info/RECORD,,
|
||||
certifi-2022.9.24.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
||||
certifi-2022.9.24.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
||||
certifi/__init__.py,sha256=luDjIGxDSrQ9O0zthdz5Lnt069Z_7eR1GIEefEaf-Ys,94
|
||||
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
||||
certifi/__pycache__/__init__.cpython-310.pyc,,
|
||||
certifi/__pycache__/__main__.cpython-310.pyc,,
|
||||
certifi/__pycache__/core.cpython-310.pyc,,
|
||||
certifi/cacert.pem,sha256=3l8CcWt_qL42030rGieD3SLufICFX0bYtGhDl_EXVPI,286370
|
||||
certifi/core.py,sha256=lhewz0zFb2b4ULsQurElmloYwQoecjWzPqY67P8T7iM,4219
|
||||
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
@@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.0)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
certifi
|
||||
@@ -0,0 +1,4 @@
|
||||
from .core import contents, where
|
||||
|
||||
__all__ = ["contents", "where"]
|
||||
__version__ = "2022.09.24"
|
||||
@@ -0,0 +1,12 @@
|
||||
import argparse
|
||||
|
||||
from certifi import contents, where
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-c", "--contents", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.contents:
|
||||
print(contents())
|
||||
else:
|
||||
print(where())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem or its contents.
|
||||
"""
|
||||
import sys
|
||||
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the file
|
||||
# in cases where we're inside of a zipimport situation until someone
|
||||
# actually calls where(), but we don't want to re-extract the file
|
||||
# on every call of where(), so we'll do it once then store it in a
|
||||
# global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you to
|
||||
# manage the cleanup of this file, so it doesn't actually return a
|
||||
# path, it returns a context manager that will give you the path
|
||||
# when you enter it and will do any cleanup when you leave it. In
|
||||
# the common case of not needing a temporary file, it will just
|
||||
# return the file system location and the __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
||||
|
||||
elif sys.version_info >= (3, 7):
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the
|
||||
# file in cases where we're inside of a zipimport situation until
|
||||
# someone actually calls where(), but we don't want to re-extract
|
||||
# the file on every call of where(), so we'll do it once then store
|
||||
# it in a global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you
|
||||
# to manage the cleanup of this file, so it doesn't actually
|
||||
# return a path, it returns a context manager that will give
|
||||
# you the path when you enter it and will do any cleanup when
|
||||
# you leave it. In the common case of not needing a temporary
|
||||
# file, it will just return the file system location and the
|
||||
# __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
|
||||
else:
|
||||
import os
|
||||
import types
|
||||
from typing import Union
|
||||
|
||||
Package = Union[types.ModuleType, str]
|
||||
Resource = Union[str, "os.PathLike"]
|
||||
|
||||
# This fallback will work for Python versions prior to 3.7 that lack the
|
||||
# importlib.resources module but relies on the existing `where` function
|
||||
# so won't address issues with environments like PyOxidizer that don't set
|
||||
# __file__ on modules.
|
||||
def read_text(
|
||||
package: Package,
|
||||
resource: Resource,
|
||||
encoding: str = 'utf-8',
|
||||
errors: str = 'strict'
|
||||
) -> str:
|
||||
with open(where(), encoding=encoding) as data:
|
||||
return data.read()
|
||||
|
||||
# If we don't have importlib.resources, then we will just do the old logic
|
||||
# of assuming we're on the filesystem and munge the path directly.
|
||||
def where() -> str:
|
||||
f = os.path.dirname(__file__)
|
||||
|
||||
return os.path.join(f, "cacert.pem")
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
@@ -0,0 +1 @@
|
||||
pip
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,269 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: charset-normalizer
|
||||
Version: 2.1.1
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Home-page: https://github.com/ousret/charset_normalizer
|
||||
Author: Ahmed TAHRI @Ousret
|
||||
Author-email: ahmed.tahri@cloudnursery.dev
|
||||
License: MIT
|
||||
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
||||
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.6.0
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode_backport
|
||||
Requires-Dist: unicodedata2 ; extra == 'unicode_backport'
|
||||
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋 <a href="https://twitter.com/intent/tweet?text=The%20Real%20First%20Universal%20Charset%20%26%20Language%20Detector&url=https://www.github.com/Ousret/charset_normalizer&hashtags=python,encoding,chardet,developers"><img src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social"/></a></h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
|
||||
<img src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| `Fast` | ❌<br> | ✅<br> | ✅ <br> |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
||||
Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
||||
|
||||
## ⭐ Your support
|
||||
|
||||
*Fork, test-it, star-it, submit your ideas! We do listen.*
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
||||
| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
||||
| charset-normalizer | 400 ms | 200 ms | 15 ms |
|
||||
|
||||
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
||||
> (eg. Supported Encoding) Challenge-them if you want.
|
||||
|
||||
[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
|
||||
a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using PyPi for latest stable
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
If you want a more up-to-date `unicodedata` than the one available in your Python setup.
|
||||
```sh
|
||||
pip install charset-normalizer[unicode_backport] -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
:tada: Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Normalize any text file*
|
||||
```python
|
||||
from charset_normalizer import normalize
|
||||
try:
|
||||
normalize('./my_subtitle.srt') # should write to disk my_subtitle-***.srt
|
||||
except IOError as e:
|
||||
print('Sadly, we are unable to perform charset normalization.', str(e))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered string.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding table that could not fit the binary content.
|
||||
- Measure chaos, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is chaos/mess and coherence according to **YOU ?**
|
||||
|
||||
*Chaos :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
||||
I know that my interpretation of what is chaotic is very subjective, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © 2019 [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
@@ -0,0 +1,33 @@
|
||||
../../bin/normalizer,sha256=qSpvGsyLwjZW3uUUIySX3JlQOmSuDLIwdxOnC53iQiU,243
|
||||
charset_normalizer-2.1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-2.1.1.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
|
||||
charset_normalizer-2.1.1.dist-info/METADATA,sha256=C99l12g4d1E9_UiW-mqPCWx7v2M_lYGWxy1GTOjXSsA,11942
|
||||
charset_normalizer-2.1.1.dist-info/RECORD,,
|
||||
charset_normalizer-2.1.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
charset_normalizer-2.1.1.dist-info/entry_points.txt,sha256=uYo8aIGLWv8YgWfSna5HnfY_En4pkF1w4bgawNAXzP0,76
|
||||
charset_normalizer-2.1.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=jGhhf1IcOgCpZsr593E9fPvjWKnflVqHe_LwkOJjInU,1790
|
||||
charset_normalizer/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-310.pyc,,
|
||||
charset_normalizer/api.py,sha256=euVPmjAMbjpqhEHPjfKtyy1mK52U0TOUBUQgM_Qy6eE,19191
|
||||
charset_normalizer/assets/__init__.py,sha256=r7aakPaRIc2FFG2mw2V8NOTvkl25_euKZ3wPf5SAVa4,15222
|
||||
charset_normalizer/assets/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/cd.py,sha256=Pxdkbn4cy0iZF42KTb1FiWIqqKobuz_fDjGwc6JMNBc,10811
|
||||
charset_normalizer/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/cli/__pycache__/normalizer.cpython-310.pyc,,
|
||||
charset_normalizer/cli/normalizer.py,sha256=FmD1RXeMpRBg_mjR0MaJhNUpM2qZ8wz2neAE7AayBeg,9521
|
||||
charset_normalizer/constant.py,sha256=NgU-pY8JH2a9lkVT8oKwAFmIUYNKOuSBwZgF9MrlNCM,19157
|
||||
charset_normalizer/legacy.py,sha256=XKeZOts_HdYQU_Jb3C9ZfOjY2CiUL132k9_nXer8gig,3384
|
||||
charset_normalizer/md.py,sha256=pZP8IVpSC82D8INA9Tf_y0ijJSRI-UIncZvLdfTWEd4,17642
|
||||
charset_normalizer/models.py,sha256=i68YdlSLTEI3EEBVXq8TLNAbyyjrLC2OWszc-OBAk9I,13167
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=ykOznhcAeH-ODLBWJuI7t1nbwa1SAfN_bDYTCJGyh4U,11771
|
||||
charset_normalizer/version.py,sha256=_eh2MA3qS__IajlePQxKBmlw6zaBDvPYlLdEgxgIojw,79
|
||||
@@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
normalizer = charset_normalizer.cli.normalizer:cli_detect
|
||||
@@ -0,0 +1 @@
|
||||
charset_normalizer
|
||||
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Others methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .legacy import (
|
||||
CharsetDetector,
|
||||
CharsetDoctor,
|
||||
CharsetNormalizerMatch,
|
||||
CharsetNormalizerMatches,
|
||||
detect,
|
||||
)
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"normalize",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"CharsetNormalizerMatch",
|
||||
"CharsetNormalizerMatches",
|
||||
"CharsetDetector",
|
||||
"CharsetDoctor",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,584 @@
|
||||
import logging
|
||||
import warnings
|
||||
from os import PathLike
|
||||
from os.path import basename, splitext
|
||||
from typing import Any, BinaryIO, List, Optional, Set
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
# Will most likely be controversial
|
||||
# logging.addLevelName(TRACE, "TRACE")
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
||||
If there is no results, it is a strong indicator that the source is binary/not text.
|
||||
By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
|
||||
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
||||
but never take it for granted. Can improve the performance.
|
||||
|
||||
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {0}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level or logging.WARNING)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: List[str] = []
|
||||
|
||||
specified_encoding: Optional[str] = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: Set[str] = set()
|
||||
tested_but_hard_failure: List[str] = []
|
||||
tested_but_soft_failure: List[str] = []
|
||||
|
||||
fallback_ascii: Optional[CharsetMatch] = None
|
||||
fallback_u8: Optional[CharsetMatch] = None
|
||||
fallback_specified: Optional[CharsetMatch] = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
||||
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: Optional[str] = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)],
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :],
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
similar_soft_failure_test = True
|
||||
break
|
||||
|
||||
if similar_soft_failure_test:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
||||
encoding_iana,
|
||||
encoding_soft_failed,
|
||||
)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: List[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
encoding_iana in ["ascii", "utf_8", specified_encoding]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences, encoding_iana, threshold, False, [], decoded_payload
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# We shall skip the CD when its about ASCII
|
||||
# Most of the time its not relevant to run "language-detection" on it.
|
||||
if encoding_iana != "ascii":
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk, 0.1, ",".join(target_languages) if target_languages else None
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
results.append(
|
||||
CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
decoded_payload,
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.", encoding_iana
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but using a file pointer that is already ready.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
)
|
||||
|
||||
|
||||
def normalize(
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
) -> CharsetMatch:
|
||||
"""
|
||||
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
|
||||
"""
|
||||
warnings.warn(
|
||||
"normalize is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
results = from_path(
|
||||
path,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
)
|
||||
|
||||
filename = basename(path)
|
||||
target_extensions = list(splitext(filename))
|
||||
|
||||
if len(results) == 0:
|
||||
raise IOError(
|
||||
'Unable to normalize "{}", no encoding charset seems to fit.'.format(
|
||||
filename
|
||||
)
|
||||
)
|
||||
|
||||
result = results.best()
|
||||
|
||||
target_extensions[0] += "-" + result.encoding # type: ignore
|
||||
|
||||
with open(
|
||||
"{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
|
||||
) as fp:
|
||||
fp.write(result.output()) # type: ignore
|
||||
|
||||
return result # type: ignore
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,339 @@
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Return associated unicode ranges in a single byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise IOError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module(
|
||||
"encodings.{}".format(iana_name)
|
||||
).IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: Dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: Optional[str] = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> List[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: List[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: Optional[str] = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese", "Classical Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> Tuple[bool, bool]:
|
||||
"""
|
||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: List[str], ignore_non_latin: bool = False
|
||||
) -> List[str]:
|
||||
"""
|
||||
Return associated languages associated to given characters.
|
||||
"""
|
||||
languages: List[Tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: List[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
|
||||
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
||||
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError("{} not available".format(language))
|
||||
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
for character in ordered_characters:
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
characters_before_source: List[str] = FREQUENCIES[language][
|
||||
0 : FREQUENCIES[language].index(character)
|
||||
]
|
||||
characters_after_source: List[str] = FREQUENCIES[language][
|
||||
FREQUENCIES[language].index(character) :
|
||||
]
|
||||
characters_before: List[str] = ordered_characters[
|
||||
0 : ordered_characters.index(character)
|
||||
]
|
||||
characters_after: List[str] = ordered_characters[
|
||||
ordered_characters.index(character) :
|
||||
]
|
||||
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if len(characters_after_source) == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
before_match_count / len(characters_before_source) >= 0.4
|
||||
or after_match_count / len(characters_after_source) >= 0.4
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
|
||||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||
One containing the latin letters and the other hebrew.
|
||||
"""
|
||||
layers: Dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range: Optional[str] = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = character.lower()
|
||||
continue
|
||||
|
||||
layers[layer_target_range] += character.lower()
|
||||
|
||||
return list(layers.values())
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merge results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: Dict[str, List[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
|
||||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results: List[Tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(results, key=lambda x: x[1], reverse=True)
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,295 @@
|
||||
import argparse
|
||||
import sys
|
||||
from json import dumps
|
||||
from os.path import abspath
|
||||
from platform import python_version
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from unicodedata2 import unidata_version
|
||||
except ImportError:
|
||||
from unicodedata import unidata_version
|
||||
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||
"""Ask a yes/no question via input() and return their answer.
|
||||
|
||||
"question" is a string that is presented to the user.
|
||||
"default" is the presumed answer if the user just hits <Enter>.
|
||||
It must be "yes" (the default), "no" or None (meaning
|
||||
an answer is required of the user).
|
||||
|
||||
The "answer" return value is True for "yes" or False for "no".
|
||||
|
||||
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
||||
"""
|
||||
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
||||
if default is None:
|
||||
prompt = " [y/n] "
|
||||
elif default == "yes":
|
||||
prompt = " [Y/n] "
|
||||
elif default == "no":
|
||||
prompt = " [y/N] "
|
||||
else:
|
||||
raise ValueError("invalid default answer: '%s'" % default)
|
||||
|
||||
while True:
|
||||
sys.stdout.write(question + prompt)
|
||||
choice = input().lower()
|
||||
if default is not None and choice == "":
|
||||
return valid[default]
|
||||
elif choice in valid:
|
||||
return valid[choice]
|
||||
else:
|
||||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else equal trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {}".format(
|
||||
__version__, python_version(), unidata_version
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
|
||||
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else "",
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
o_: List[str] = my_file.name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
|
||||
|
||||
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
|
||||
fp.write(str(best_guess))
|
||||
except IOError as e:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
@@ -0,0 +1,497 @@
|
||||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||
from encodings.aliases import aliases
|
||||
from re import IGNORECASE, compile as re_compile
|
||||
from typing import Dict, List, Set, Union
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
|
||||
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
||||
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
||||
"utf_8": BOM_UTF8,
|
||||
"utf_7": [
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
"gb18030": b"\x84\x31\x95\x33",
|
||||
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
||||
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
||||
}
|
||||
|
||||
TOO_SMALL_SEQUENCE: int = 32
|
||||
TOO_BIG_SEQUENCE: int = int(10e6)
|
||||
|
||||
UTF8_MAXIMAL_ALLOCATION: int = 1112064
|
||||
|
||||
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
||||
"Control character": range(31 + 1),
|
||||
"Basic Latin": range(32, 127 + 1),
|
||||
"Latin-1 Supplement": range(128, 255 + 1),
|
||||
"Latin Extended-A": range(256, 383 + 1),
|
||||
"Latin Extended-B": range(384, 591 + 1),
|
||||
"IPA Extensions": range(592, 687 + 1),
|
||||
"Spacing Modifier Letters": range(688, 767 + 1),
|
||||
"Combining Diacritical Marks": range(768, 879 + 1),
|
||||
"Greek and Coptic": range(880, 1023 + 1),
|
||||
"Cyrillic": range(1024, 1279 + 1),
|
||||
"Cyrillic Supplement": range(1280, 1327 + 1),
|
||||
"Armenian": range(1328, 1423 + 1),
|
||||
"Hebrew": range(1424, 1535 + 1),
|
||||
"Arabic": range(1536, 1791 + 1),
|
||||
"Syriac": range(1792, 1871 + 1),
|
||||
"Arabic Supplement": range(1872, 1919 + 1),
|
||||
"Thaana": range(1920, 1983 + 1),
|
||||
"NKo": range(1984, 2047 + 1),
|
||||
"Samaritan": range(2048, 2111 + 1),
|
||||
"Mandaic": range(2112, 2143 + 1),
|
||||
"Syriac Supplement": range(2144, 2159 + 1),
|
||||
"Arabic Extended-A": range(2208, 2303 + 1),
|
||||
"Devanagari": range(2304, 2431 + 1),
|
||||
"Bengali": range(2432, 2559 + 1),
|
||||
"Gurmukhi": range(2560, 2687 + 1),
|
||||
"Gujarati": range(2688, 2815 + 1),
|
||||
"Oriya": range(2816, 2943 + 1),
|
||||
"Tamil": range(2944, 3071 + 1),
|
||||
"Telugu": range(3072, 3199 + 1),
|
||||
"Kannada": range(3200, 3327 + 1),
|
||||
"Malayalam": range(3328, 3455 + 1),
|
||||
"Sinhala": range(3456, 3583 + 1),
|
||||
"Thai": range(3584, 3711 + 1),
|
||||
"Lao": range(3712, 3839 + 1),
|
||||
"Tibetan": range(3840, 4095 + 1),
|
||||
"Myanmar": range(4096, 4255 + 1),
|
||||
"Georgian": range(4256, 4351 + 1),
|
||||
"Hangul Jamo": range(4352, 4607 + 1),
|
||||
"Ethiopic": range(4608, 4991 + 1),
|
||||
"Ethiopic Supplement": range(4992, 5023 + 1),
|
||||
"Cherokee": range(5024, 5119 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1),
|
||||
"Ogham": range(5760, 5791 + 1),
|
||||
"Runic": range(5792, 5887 + 1),
|
||||
"Tagalog": range(5888, 5919 + 1),
|
||||
"Hanunoo": range(5920, 5951 + 1),
|
||||
"Buhid": range(5952, 5983 + 1),
|
||||
"Tagbanwa": range(5984, 6015 + 1),
|
||||
"Khmer": range(6016, 6143 + 1),
|
||||
"Mongolian": range(6144, 6319 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1),
|
||||
"Limbu": range(6400, 6479 + 1),
|
||||
"Tai Le": range(6480, 6527 + 1),
|
||||
"New Tai Lue": range(6528, 6623 + 1),
|
||||
"Khmer Symbols": range(6624, 6655 + 1),
|
||||
"Buginese": range(6656, 6687 + 1),
|
||||
"Tai Tham": range(6688, 6831 + 1),
|
||||
"Combining Diacritical Marks Extended": range(6832, 6911 + 1),
|
||||
"Balinese": range(6912, 7039 + 1),
|
||||
"Sundanese": range(7040, 7103 + 1),
|
||||
"Batak": range(7104, 7167 + 1),
|
||||
"Lepcha": range(7168, 7247 + 1),
|
||||
"Ol Chiki": range(7248, 7295 + 1),
|
||||
"Cyrillic Extended C": range(7296, 7311 + 1),
|
||||
"Sundanese Supplement": range(7360, 7375 + 1),
|
||||
"Vedic Extensions": range(7376, 7423 + 1),
|
||||
"Phonetic Extensions": range(7424, 7551 + 1),
|
||||
"Phonetic Extensions Supplement": range(7552, 7615 + 1),
|
||||
"Combining Diacritical Marks Supplement": range(7616, 7679 + 1),
|
||||
"Latin Extended Additional": range(7680, 7935 + 1),
|
||||
"Greek Extended": range(7936, 8191 + 1),
|
||||
"General Punctuation": range(8192, 8303 + 1),
|
||||
"Superscripts and Subscripts": range(8304, 8351 + 1),
|
||||
"Currency Symbols": range(8352, 8399 + 1),
|
||||
"Combining Diacritical Marks for Symbols": range(8400, 8447 + 1),
|
||||
"Letterlike Symbols": range(8448, 8527 + 1),
|
||||
"Number Forms": range(8528, 8591 + 1),
|
||||
"Arrows": range(8592, 8703 + 1),
|
||||
"Mathematical Operators": range(8704, 8959 + 1),
|
||||
"Miscellaneous Technical": range(8960, 9215 + 1),
|
||||
"Control Pictures": range(9216, 9279 + 1),
|
||||
"Optical Character Recognition": range(9280, 9311 + 1),
|
||||
"Enclosed Alphanumerics": range(9312, 9471 + 1),
|
||||
"Box Drawing": range(9472, 9599 + 1),
|
||||
"Block Elements": range(9600, 9631 + 1),
|
||||
"Geometric Shapes": range(9632, 9727 + 1),
|
||||
"Miscellaneous Symbols": range(9728, 9983 + 1),
|
||||
"Dingbats": range(9984, 10175 + 1),
|
||||
"Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1),
|
||||
"Supplemental Arrows-A": range(10224, 10239 + 1),
|
||||
"Braille Patterns": range(10240, 10495 + 1),
|
||||
"Supplemental Arrows-B": range(10496, 10623 + 1),
|
||||
"Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1),
|
||||
"Supplemental Mathematical Operators": range(10752, 11007 + 1),
|
||||
"Miscellaneous Symbols and Arrows": range(11008, 11263 + 1),
|
||||
"Glagolitic": range(11264, 11359 + 1),
|
||||
"Latin Extended-C": range(11360, 11391 + 1),
|
||||
"Coptic": range(11392, 11519 + 1),
|
||||
"Georgian Supplement": range(11520, 11567 + 1),
|
||||
"Tifinagh": range(11568, 11647 + 1),
|
||||
"Ethiopic Extended": range(11648, 11743 + 1),
|
||||
"Cyrillic Extended-A": range(11744, 11775 + 1),
|
||||
"Supplemental Punctuation": range(11776, 11903 + 1),
|
||||
"CJK Radicals Supplement": range(11904, 12031 + 1),
|
||||
"Kangxi Radicals": range(12032, 12255 + 1),
|
||||
"Ideographic Description Characters": range(12272, 12287 + 1),
|
||||
"CJK Symbols and Punctuation": range(12288, 12351 + 1),
|
||||
"Hiragana": range(12352, 12447 + 1),
|
||||
"Katakana": range(12448, 12543 + 1),
|
||||
"Bopomofo": range(12544, 12591 + 1),
|
||||
"Hangul Compatibility Jamo": range(12592, 12687 + 1),
|
||||
"Kanbun": range(12688, 12703 + 1),
|
||||
"Bopomofo Extended": range(12704, 12735 + 1),
|
||||
"CJK Strokes": range(12736, 12783 + 1),
|
||||
"Katakana Phonetic Extensions": range(12784, 12799 + 1),
|
||||
"Enclosed CJK Letters and Months": range(12800, 13055 + 1),
|
||||
"CJK Compatibility": range(13056, 13311 + 1),
|
||||
"CJK Unified Ideographs Extension A": range(13312, 19903 + 1),
|
||||
"Yijing Hexagram Symbols": range(19904, 19967 + 1),
|
||||
"CJK Unified Ideographs": range(19968, 40959 + 1),
|
||||
"Yi Syllables": range(40960, 42127 + 1),
|
||||
"Yi Radicals": range(42128, 42191 + 1),
|
||||
"Lisu": range(42192, 42239 + 1),
|
||||
"Vai": range(42240, 42559 + 1),
|
||||
"Cyrillic Extended-B": range(42560, 42655 + 1),
|
||||
"Bamum": range(42656, 42751 + 1),
|
||||
"Modifier Tone Letters": range(42752, 42783 + 1),
|
||||
"Latin Extended-D": range(42784, 43007 + 1),
|
||||
"Syloti Nagri": range(43008, 43055 + 1),
|
||||
"Common Indic Number Forms": range(43056, 43071 + 1),
|
||||
"Phags-pa": range(43072, 43135 + 1),
|
||||
"Saurashtra": range(43136, 43231 + 1),
|
||||
"Devanagari Extended": range(43232, 43263 + 1),
|
||||
"Kayah Li": range(43264, 43311 + 1),
|
||||
"Rejang": range(43312, 43359 + 1),
|
||||
"Hangul Jamo Extended-A": range(43360, 43391 + 1),
|
||||
"Javanese": range(43392, 43487 + 1),
|
||||
"Myanmar Extended-B": range(43488, 43519 + 1),
|
||||
"Cham": range(43520, 43615 + 1),
|
||||
"Myanmar Extended-A": range(43616, 43647 + 1),
|
||||
"Tai Viet": range(43648, 43743 + 1),
|
||||
"Meetei Mayek Extensions": range(43744, 43775 + 1),
|
||||
"Ethiopic Extended-A": range(43776, 43823 + 1),
|
||||
"Latin Extended-E": range(43824, 43887 + 1),
|
||||
"Cherokee Supplement": range(43888, 43967 + 1),
|
||||
"Meetei Mayek": range(43968, 44031 + 1),
|
||||
"Hangul Syllables": range(44032, 55215 + 1),
|
||||
"Hangul Jamo Extended-B": range(55216, 55295 + 1),
|
||||
"High Surrogates": range(55296, 56191 + 1),
|
||||
"High Private Use Surrogates": range(56192, 56319 + 1),
|
||||
"Low Surrogates": range(56320, 57343 + 1),
|
||||
"Private Use Area": range(57344, 63743 + 1),
|
||||
"CJK Compatibility Ideographs": range(63744, 64255 + 1),
|
||||
"Alphabetic Presentation Forms": range(64256, 64335 + 1),
|
||||
"Arabic Presentation Forms-A": range(64336, 65023 + 1),
|
||||
"Variation Selectors": range(65024, 65039 + 1),
|
||||
"Vertical Forms": range(65040, 65055 + 1),
|
||||
"Combining Half Marks": range(65056, 65071 + 1),
|
||||
"CJK Compatibility Forms": range(65072, 65103 + 1),
|
||||
"Small Form Variants": range(65104, 65135 + 1),
|
||||
"Arabic Presentation Forms-B": range(65136, 65279 + 1),
|
||||
"Halfwidth and Fullwidth Forms": range(65280, 65519 + 1),
|
||||
"Specials": range(65520, 65535 + 1),
|
||||
"Linear B Syllabary": range(65536, 65663 + 1),
|
||||
"Linear B Ideograms": range(65664, 65791 + 1),
|
||||
"Aegean Numbers": range(65792, 65855 + 1),
|
||||
"Ancient Greek Numbers": range(65856, 65935 + 1),
|
||||
"Ancient Symbols": range(65936, 65999 + 1),
|
||||
"Phaistos Disc": range(66000, 66047 + 1),
|
||||
"Lycian": range(66176, 66207 + 1),
|
||||
"Carian": range(66208, 66271 + 1),
|
||||
"Coptic Epact Numbers": range(66272, 66303 + 1),
|
||||
"Old Italic": range(66304, 66351 + 1),
|
||||
"Gothic": range(66352, 66383 + 1),
|
||||
"Old Permic": range(66384, 66431 + 1),
|
||||
"Ugaritic": range(66432, 66463 + 1),
|
||||
"Old Persian": range(66464, 66527 + 1),
|
||||
"Deseret": range(66560, 66639 + 1),
|
||||
"Shavian": range(66640, 66687 + 1),
|
||||
"Osmanya": range(66688, 66735 + 1),
|
||||
"Osage": range(66736, 66815 + 1),
|
||||
"Elbasan": range(66816, 66863 + 1),
|
||||
"Caucasian Albanian": range(66864, 66927 + 1),
|
||||
"Linear A": range(67072, 67455 + 1),
|
||||
"Cypriot Syllabary": range(67584, 67647 + 1),
|
||||
"Imperial Aramaic": range(67648, 67679 + 1),
|
||||
"Palmyrene": range(67680, 67711 + 1),
|
||||
"Nabataean": range(67712, 67759 + 1),
|
||||
"Hatran": range(67808, 67839 + 1),
|
||||
"Phoenician": range(67840, 67871 + 1),
|
||||
"Lydian": range(67872, 67903 + 1),
|
||||
"Meroitic Hieroglyphs": range(67968, 67999 + 1),
|
||||
"Meroitic Cursive": range(68000, 68095 + 1),
|
||||
"Kharoshthi": range(68096, 68191 + 1),
|
||||
"Old South Arabian": range(68192, 68223 + 1),
|
||||
"Old North Arabian": range(68224, 68255 + 1),
|
||||
"Manichaean": range(68288, 68351 + 1),
|
||||
"Avestan": range(68352, 68415 + 1),
|
||||
"Inscriptional Parthian": range(68416, 68447 + 1),
|
||||
"Inscriptional Pahlavi": range(68448, 68479 + 1),
|
||||
"Psalter Pahlavi": range(68480, 68527 + 1),
|
||||
"Old Turkic": range(68608, 68687 + 1),
|
||||
"Old Hungarian": range(68736, 68863 + 1),
|
||||
"Rumi Numeral Symbols": range(69216, 69247 + 1),
|
||||
"Brahmi": range(69632, 69759 + 1),
|
||||
"Kaithi": range(69760, 69839 + 1),
|
||||
"Sora Sompeng": range(69840, 69887 + 1),
|
||||
"Chakma": range(69888, 69967 + 1),
|
||||
"Mahajani": range(69968, 70015 + 1),
|
||||
"Sharada": range(70016, 70111 + 1),
|
||||
"Sinhala Archaic Numbers": range(70112, 70143 + 1),
|
||||
"Khojki": range(70144, 70223 + 1),
|
||||
"Multani": range(70272, 70319 + 1),
|
||||
"Khudawadi": range(70320, 70399 + 1),
|
||||
"Grantha": range(70400, 70527 + 1),
|
||||
"Newa": range(70656, 70783 + 1),
|
||||
"Tirhuta": range(70784, 70879 + 1),
|
||||
"Siddham": range(71040, 71167 + 1),
|
||||
"Modi": range(71168, 71263 + 1),
|
||||
"Mongolian Supplement": range(71264, 71295 + 1),
|
||||
"Takri": range(71296, 71375 + 1),
|
||||
"Ahom": range(71424, 71487 + 1),
|
||||
"Warang Citi": range(71840, 71935 + 1),
|
||||
"Zanabazar Square": range(72192, 72271 + 1),
|
||||
"Soyombo": range(72272, 72367 + 1),
|
||||
"Pau Cin Hau": range(72384, 72447 + 1),
|
||||
"Bhaiksuki": range(72704, 72815 + 1),
|
||||
"Marchen": range(72816, 72895 + 1),
|
||||
"Masaram Gondi": range(72960, 73055 + 1),
|
||||
"Cuneiform": range(73728, 74751 + 1),
|
||||
"Cuneiform Numbers and Punctuation": range(74752, 74879 + 1),
|
||||
"Early Dynastic Cuneiform": range(74880, 75087 + 1),
|
||||
"Egyptian Hieroglyphs": range(77824, 78895 + 1),
|
||||
"Anatolian Hieroglyphs": range(82944, 83583 + 1),
|
||||
"Bamum Supplement": range(92160, 92735 + 1),
|
||||
"Mro": range(92736, 92783 + 1),
|
||||
"Bassa Vah": range(92880, 92927 + 1),
|
||||
"Pahawh Hmong": range(92928, 93071 + 1),
|
||||
"Miao": range(93952, 94111 + 1),
|
||||
"Ideographic Symbols and Punctuation": range(94176, 94207 + 1),
|
||||
"Tangut": range(94208, 100351 + 1),
|
||||
"Tangut Components": range(100352, 101119 + 1),
|
||||
"Kana Supplement": range(110592, 110847 + 1),
|
||||
"Kana Extended-A": range(110848, 110895 + 1),
|
||||
"Nushu": range(110960, 111359 + 1),
|
||||
"Duployan": range(113664, 113823 + 1),
|
||||
"Shorthand Format Controls": range(113824, 113839 + 1),
|
||||
"Byzantine Musical Symbols": range(118784, 119039 + 1),
|
||||
"Musical Symbols": range(119040, 119295 + 1),
|
||||
"Ancient Greek Musical Notation": range(119296, 119375 + 1),
|
||||
"Tai Xuan Jing Symbols": range(119552, 119647 + 1),
|
||||
"Counting Rod Numerals": range(119648, 119679 + 1),
|
||||
"Mathematical Alphanumeric Symbols": range(119808, 120831 + 1),
|
||||
"Sutton SignWriting": range(120832, 121519 + 1),
|
||||
"Glagolitic Supplement": range(122880, 122927 + 1),
|
||||
"Mende Kikakui": range(124928, 125151 + 1),
|
||||
"Adlam": range(125184, 125279 + 1),
|
||||
"Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1),
|
||||
"Mahjong Tiles": range(126976, 127023 + 1),
|
||||
"Domino Tiles": range(127024, 127135 + 1),
|
||||
"Playing Cards": range(127136, 127231 + 1),
|
||||
"Enclosed Alphanumeric Supplement": range(127232, 127487 + 1),
|
||||
"Enclosed Ideographic Supplement": range(127488, 127743 + 1),
|
||||
"Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1),
|
||||
"Emoticons range(Emoji)": range(128512, 128591 + 1),
|
||||
"Ornamental Dingbats": range(128592, 128639 + 1),
|
||||
"Transport and Map Symbols": range(128640, 128767 + 1),
|
||||
"Alchemical Symbols": range(128768, 128895 + 1),
|
||||
"Geometric Shapes Extended": range(128896, 129023 + 1),
|
||||
"Supplemental Arrows-C": range(129024, 129279 + 1),
|
||||
"Supplemental Symbols and Pictographs": range(129280, 129535 + 1),
|
||||
"CJK Unified Ideographs Extension B": range(131072, 173791 + 1),
|
||||
"CJK Unified Ideographs Extension C": range(173824, 177983 + 1),
|
||||
"CJK Unified Ideographs Extension D": range(177984, 178207 + 1),
|
||||
"CJK Unified Ideographs Extension E": range(178208, 183983 + 1),
|
||||
"CJK Unified Ideographs Extension F": range(183984, 191471 + 1),
|
||||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||
"Tags": range(917504, 917631 + 1),
|
||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||
}
|
||||
|
||||
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||
"Supplement",
|
||||
"Extended",
|
||||
"Extensions",
|
||||
"Modifier",
|
||||
"Marks",
|
||||
"Punctuation",
|
||||
"Symbols",
|
||||
"Forms",
|
||||
"Operators",
|
||||
"Miscellaneous",
|
||||
"Drawing",
|
||||
"Block",
|
||||
"Shapes",
|
||||
"Supplemental",
|
||||
"Tags",
|
||||
]
|
||||
|
||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||
IGNORECASE,
|
||||
)
|
||||
|
||||
IANA_SUPPORTED: List[str] = sorted(
|
||||
filter(
|
||||
lambda x: x.endswith("_codec") is False
|
||||
and x not in {"rot_13", "tactis", "mbcs"},
|
||||
list(set(aliases.values())),
|
||||
)
|
||||
)
|
||||
|
||||
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||
|
||||
# pre-computed code page that are similar using the function cp_similarity.
|
||||
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||
"cp1125": ["cp866"],
|
||||
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
||||
"cp1250": ["iso8859_2"],
|
||||
"cp1251": ["kz1048", "ptcp154"],
|
||||
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
||||
"cp1253": ["iso8859_7"],
|
||||
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
||||
"cp1257": ["iso8859_13"],
|
||||
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
||||
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
||||
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
||||
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
||||
"cp857": ["cp850", "cp858", "cp865"],
|
||||
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
||||
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
||||
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
||||
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
||||
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
||||
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
||||
"cp866": ["cp1125"],
|
||||
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
||||
"iso8859_11": ["tis_620"],
|
||||
"iso8859_13": ["cp1257"],
|
||||
"iso8859_14": [
|
||||
"iso8859_10",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_15": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_16": [
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_2",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
||||
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
||||
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
||||
"iso8859_7": ["cp1253"],
|
||||
"iso8859_9": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"cp1258",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_4",
|
||||
"latin_1",
|
||||
],
|
||||
"kz1048": ["cp1251", "ptcp154"],
|
||||
"latin_1": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"cp1258",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_4",
|
||||
"iso8859_9",
|
||||
],
|
||||
"mac_iceland": ["mac_roman", "mac_turkish"],
|
||||
"mac_roman": ["mac_iceland", "mac_turkish"],
|
||||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||
"ptcp154": ["cp1251", "kz1048"],
|
||||
"tis_620": ["iso8859_11"],
|
||||
}
|
||||
|
||||
|
||||
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||
"iso2022_kr": "ISO-2022-KR",
|
||||
"iso2022_jp": "ISO-2022-JP",
|
||||
"euc_kr": "EUC-KR",
|
||||
"tis_620": "TIS-620",
|
||||
"utf_32": "UTF-32",
|
||||
"euc_jp": "EUC-JP",
|
||||
"koi8_r": "KOI8-R",
|
||||
"iso8859_1": "ISO-8859-1",
|
||||
"iso8859_2": "ISO-8859-2",
|
||||
"iso8859_5": "ISO-8859-5",
|
||||
"iso8859_6": "ISO-8859-6",
|
||||
"iso8859_7": "ISO-8859-7",
|
||||
"iso8859_8": "ISO-8859-8",
|
||||
"utf_16": "UTF-16",
|
||||
"cp855": "IBM855",
|
||||
"mac_cyrillic": "MacCyrillic",
|
||||
"gb2312": "GB2312",
|
||||
"gb18030": "GB18030",
|
||||
"cp932": "CP932",
|
||||
"cp866": "IBM866",
|
||||
"utf_8": "utf-8",
|
||||
"utf_8_sig": "UTF-8-SIG",
|
||||
"shift_jis": "SHIFT_JIS",
|
||||
"big5": "Big5",
|
||||
"cp1250": "windows-1250",
|
||||
"cp1251": "windows-1251",
|
||||
"cp1252": "Windows-1252",
|
||||
"cp1253": "windows-1253",
|
||||
"cp1255": "windows-1255",
|
||||
"cp1256": "windows-1256",
|
||||
"cp1254": "Windows-1254",
|
||||
"cp949": "CP949",
|
||||
}
|
||||
|
||||
|
||||
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||
"<",
|
||||
">",
|
||||
"=",
|
||||
":",
|
||||
"/",
|
||||
"&",
|
||||
";",
|
||||
"{",
|
||||
"}",
|
||||
"[",
|
||||
"]",
|
||||
",",
|
||||
"|",
|
||||
'"',
|
||||
"-",
|
||||
}
|
||||
|
||||
|
||||
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||
|
||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||
|
||||
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||
|
||||
# Logging LEVEL bellow DEBUG
|
||||
TRACE: int = 5
|
||||
@@ -0,0 +1,95 @@
|
||||
import warnings
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .constant import CHARDET_CORRESPONDENCE
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
|
||||
|
||||
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
|
||||
"""
|
||||
chardet legacy method
|
||||
Detect the encoding of the given byte string. It should be mostly backward-compatible.
|
||||
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
|
||||
This function is deprecated and should be used to migrate your project easily, consult the documentation for
|
||||
further information. Not planned for removal.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
"""
|
||||
if not isinstance(byte_str, (bytearray, bytes)):
|
||||
raise TypeError( # pragma: nocover
|
||||
"Expected object of type bytes or bytearray, got: "
|
||||
"{0}".format(type(byte_str))
|
||||
)
|
||||
|
||||
if isinstance(byte_str, bytearray):
|
||||
byte_str = bytes(byte_str)
|
||||
|
||||
r = from_bytes(byte_str).best()
|
||||
|
||||
encoding = r.encoding if r is not None else None
|
||||
language = r.language if r is not None and r.language != "Unknown" else ""
|
||||
confidence = 1.0 - r.chaos if r is not None else None
|
||||
|
||||
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
|
||||
# but chardet does return 'utf-8-sig' and it is a valid codec name.
|
||||
if r is not None and encoding == "utf_8" and r.bom:
|
||||
encoding += "_sig"
|
||||
|
||||
return {
|
||||
"encoding": encoding
|
||||
if encoding not in CHARDET_CORRESPONDENCE
|
||||
else CHARDET_CORRESPONDENCE[encoding],
|
||||
"language": language,
|
||||
"confidence": confidence,
|
||||
}
|
||||
|
||||
|
||||
class CharsetNormalizerMatch(CharsetMatch):
|
||||
pass
|
||||
|
||||
|
||||
class CharsetNormalizerMatches(CharsetMatches):
|
||||
@staticmethod
|
||||
def from_fp(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_fp(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def from_bytes(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_bytes(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def from_path(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_path(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def normalize(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return normalize(*args, **kwargs) # pragma: nocover
|
||||
|
||||
|
||||
class CharsetDetector(CharsetNormalizerMatches):
|
||||
pass
|
||||
|
||||
|
||||
class CharsetDoctor(CharsetNormalizerMatches):
|
||||
pass
|
||||
@@ -0,0 +1,553 @@
|
||||
from functools import lru_cache
|
||||
from typing import List, Optional
|
||||
|
||||
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_ascii,
|
||||
is_case_variable,
|
||||
is_cjk,
|
||||
is_emoticon,
|
||||
is_hangul,
|
||||
is_hiragana,
|
||||
is_katakana,
|
||||
is_latin,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
"""
|
||||
Determine if given character should be fed in.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
"""
|
||||
The main routine to be executed upon character.
|
||||
Insert the logic in witch the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
"""
|
||||
Permit to reset the plugin to the initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
|
||||
Must NOT be lower than 0.; No restriction gt 0.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: Optional[str] = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if is_punctuation(character):
|
||||
self._punctuation_count += 1
|
||||
elif (
|
||||
character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
and is_emoticon(character) is False
|
||||
):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_accentuated(character):
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and is_accentuated(character)
|
||||
and is_accentuated(self._last_latin_character)
|
||||
):
|
||||
if character.isupper() and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
# Worse if its the same char duplicated with different accent.
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character.isspace()
|
||||
or is_punctuation(character)
|
||||
or character in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
self._last_printable_seen = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: Optional[str] = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
if ratio_of_suspicious_range_usage < 0.1:
|
||||
return 0.0
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
self._foreign_long_watch is False
|
||||
and (is_latin(character) is False or is_accentuated(character))
|
||||
and is_cjk(character) is False
|
||||
and is_hangul(character) is False
|
||||
and is_katakana(character) is False
|
||||
and is_hiragana(character) is False
|
||||
and is_thai(character) is False
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
return
|
||||
if not self._buffer:
|
||||
return
|
||||
if (
|
||||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length > 0.34:
|
||||
self._is_current_word_bad = True
|
||||
# Word/Buffer ending with a upper case accentuated letter are so rare,
|
||||
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
|
||||
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += len(self._buffer)
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer = ""
|
||||
self._buffer_accent_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer += character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._buffer = ""
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
class CjkInvalidStopPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
|
||||
can be easily detected. Searching for the overuse of '丅' and '丄'.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._wrong_stop_count: int = 0
|
||||
self._cjk_character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character in {"丅", "丄"}:
|
||||
self._wrong_stop_count += 1
|
||||
return
|
||||
if is_cjk(character):
|
||||
self._cjk_character_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._wrong_stop_count = 0
|
||||
self._cjk_character_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._cjk_character_count < 16:
|
||||
return 0.0
|
||||
return self._wrong_stop_count / self._cjk_character_count
|
||||
|
||||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: Optional[str] = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
is_concerned = character.isalpha() and is_case_variable(character)
|
||||
chunk_sep = is_concerned is False
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and character.isdigit() is False
|
||||
and self._current_ascii_only is False
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only is True and is_ascii(character) is False:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
||||
character.islower() and self._last_alpha_seen.isupper()
|
||||
):
|
||||
if self._buf is True:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode range seen next to each other can be considered as suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = unicode_range_a.split(
|
||||
" "
|
||||
), unicode_range_b.split(" ")
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
||||
"""
|
||||
|
||||
detectors: List[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
]
|
||||
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
intermediary_mean_mess_ratio_calc = 128
|
||||
|
||||
for character, index in zip(decoded_sequence + "\n", range(length)):
|
||||
for detector in detectors:
|
||||
if detector.eligible(character):
|
||||
detector.feed(character)
|
||||
|
||||
if (
|
||||
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
||||
) or index == length - 1:
|
||||
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
|
||||
if debug:
|
||||
for dt in detectors: # pragma: nocover
|
||||
print(dt.__class__, dt.ratio)
|
||||
|
||||
return round(mean_mess_ratio, 3)
|
||||
@@ -0,0 +1,401 @@
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from encodings.aliases import aliases
|
||||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import (
|
||||
Any,
|
||||
Counter as TypeCounter,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
|
||||
from .md import mess_ratio
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: "CoherenceMatches",
|
||||
decoded_payload: Optional[str] = None,
|
||||
):
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: Optional[List[str]] = None
|
||||
|
||||
self._leaves: List[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: Optional[bytes] = None
|
||||
self._output_encoding: Optional[str] = None
|
||||
|
||||
self._string: Optional[str] = decoded_payload
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise TypeError(
|
||||
"__eq__ cannot be invoked on {} and {}.".format(
|
||||
str(other.__class__), str(self.__class__)
|
||||
)
|
||||
)
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Bellow 1% difference --> Use Coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
# When having a tough decision, use the result that decoded as many multi-byte as possible.
|
||||
if chaos_difference == 0.0 and self.coherence == other.coherence:
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
return self.coherence > other.coherence
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - len(str(self)) / len(self.raw)
|
||||
|
||||
@property
|
||||
def chaos_secondary_pass(self) -> float:
|
||||
"""
|
||||
Check once again chaos in decoded text, except this time, with full content.
|
||||
Use with caution, this can be very slow.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"chaos_secondary_pass is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return mess_ratio(str(self), 1.0)
|
||||
|
||||
@property
|
||||
def coherence_non_latin(self) -> float:
|
||||
"""
|
||||
Coherence ratio on the first non-latin language detected if ANY.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"coherence_non_latin is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return 0.0
|
||||
|
||||
@property
|
||||
def w_counter(self) -> TypeCounter[str]:
|
||||
"""
|
||||
Word counter instance on decoded text.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
|
||||
)
|
||||
|
||||
string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
|
||||
|
||||
return Counter(string_printable_only.split())
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
|
||||
|
||||
def add_submatch(self, other: "CharsetMatch") -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> List[str]:
|
||||
"""
|
||||
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as: List[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> List[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# Its either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> List["CharsetMatch"]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> List[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: List[Optional[str]] = [
|
||||
unicode_range(char) for char in str(self)
|
||||
]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> List[str]:
|
||||
"""
|
||||
The complete list of encoding that output the exact SAME str result and therefore could be the originating
|
||||
encoding.
|
||||
This list does include the encoding available in property 'encoding'.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def first(self) -> "CharsetMatch":
|
||||
"""
|
||||
Kept for BC reasons. Will be removed in 3.0.
|
||||
"""
|
||||
return self
|
||||
|
||||
def best(self) -> "CharsetMatch":
|
||||
"""
|
||||
Kept for BC reasons. Will be removed in 3.0.
|
||||
"""
|
||||
return self
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
||||
Any errors will be simply ignored by the encoder NOT replaced.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
self._output_payload = str(self).encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
"""
|
||||
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
||||
"""
|
||||
return sha256(self.output()).hexdigest()
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
||||
Act like a list(iterable) but does not implements all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: Optional[List[CharsetMatch]] = None):
|
||||
self._results: List[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. Will be inserted accordingly to preserve sort.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) <= TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> Optional["CharsetMatch"]:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> Optional["CharsetMatch"]:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: Optional[str],
|
||||
encoding_aliases: List[str],
|
||||
alternative_encodings: List[str],
|
||||
language: str,
|
||||
alphabets: List[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: Optional[str],
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: Optional[str] = unicode_path
|
||||
self.encoding: Optional[str] = encoding
|
||||
self.encoding_aliases: List[str] = encoding_aliases
|
||||
self.alternative_encodings: List[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: List[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> Dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
||||
@@ -0,0 +1,424 @@
|
||||
try:
|
||||
# WARNING: unicodedata2 support is going to be removed in 3.0
|
||||
# Python is quickly catching up.
|
||||
import unicodedata2 as unicodedata
|
||||
except ImportError:
|
||||
import unicodedata # type: ignore[no-redef]
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator, List, Optional, Set, Tuple, Union
|
||||
|
||||
from _multibytecodec import MultibyteIncrementalDecoder
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return (
|
||||
"WITH GRAVE" in description
|
||||
or "WITH ACUTE" in description
|
||||
or "WITH CEDILLA" in description
|
||||
or "WITH DIAERESIS" in description
|
||||
or "WITH CIRCUMFLEX" in description
|
||||
or "WITH TILDE" in description
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: List[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> Optional[str]:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
return range_name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_ascii(character: str) -> bool:
|
||||
try:
|
||||
character.encode("ascii")
|
||||
except UnicodeEncodeError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Punctuation" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Forms" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Emoticons" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_separator(character: str) -> bool:
|
||||
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_case_variable(character: str) -> bool:
|
||||
return character.islower() != character.isupper()
|
||||
|
||||
|
||||
def is_private_use_only(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return character_category == "Co"
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "CJK" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hiragana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "HIRAGANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_katakana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "KATAKANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hangul(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "HANGUL" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_thai(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "THAI" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
||||
def is_unicode_range_secondary(range_name: str) -> bool:
|
||||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
|
||||
"""
|
||||
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
||||
"""
|
||||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results: List[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
||||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
if encoding_iana == specified_encoding:
|
||||
return encoding_iana
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def is_multi_byte_encoding(name: str) -> bool:
|
||||
"""
|
||||
Verify is a specific encoding is a multi byte one based on it IANA name
|
||||
"""
|
||||
return name in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_le",
|
||||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
||||
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
|
||||
"""
|
||||
Identify and extract SIG/BOM in given sequence.
|
||||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
||||
for mark in marks:
|
||||
if sequence.startswith(mark):
|
||||
return iana_encoding, mark
|
||||
|
||||
return None, b""
|
||||
|
||||
|
||||
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
||||
return iana_encoding not in {"utf_16", "utf_32"}
|
||||
|
||||
|
||||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
||||
if strict:
|
||||
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
|
||||
|
||||
return cp_name
|
||||
|
||||
|
||||
def range_scan(decoded_sequence: str) -> List[str]:
|
||||
ranges: Set[str] = set()
|
||||
|
||||
for character in decoded_sequence:
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
ranges.add(character_range)
|
||||
|
||||
return list(ranges)
|
||||
|
||||
|
||||
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
||||
|
||||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_a)
|
||||
).IncrementalDecoder
|
||||
decoder_b = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_b)
|
||||
).IncrementalDecoder
|
||||
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
return character_match_count / 254
|
||||
|
||||
|
||||
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
|
||||
"""
|
||||
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
|
||||
the function cp_similarity.
|
||||
"""
|
||||
return (
|
||||
iana_name_a in IANA_SUPPORTED_SIMILAR
|
||||
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
||||
)
|
||||
|
||||
|
||||
def set_logging_handler(
|
||||
name: str = "charset_normalizer",
|
||||
level: int = logging.INFO,
|
||||
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
|
||||
) -> None:
|
||||
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(level)
|
||||
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: Optional[str] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
||||
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
Expose version
|
||||
"""
|
||||
|
||||
__version__ = "2.1.1"
|
||||
VERSION = __version__.split(".")
|
||||
@@ -0,0 +1 @@
|
||||
pip
|
||||
@@ -0,0 +1,29 @@
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2013-2021, Kim Davies
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -0,0 +1,242 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: idna
|
||||
Version: 3.4
|
||||
Summary: Internationalized Domain Names in Applications (IDNA)
|
||||
Author-email: Kim Davies <kim@cynosure.com.au>
|
||||
Requires-Python: >=3.5
|
||||
Description-Content-Type: text/x-rst
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Intended Audience :: System Administrators
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: Name Service (DNS)
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Utilities
|
||||
Project-URL: Changelog, https://github.com/kjd/idna/blob/master/HISTORY.rst
|
||||
Project-URL: Issue tracker, https://github.com/kjd/idna/issues
|
||||
Project-URL: Source, https://github.com/kjd/idna
|
||||
|
||||
Internationalized Domain Names in Applications (IDNA)
|
||||
=====================================================
|
||||
|
||||
Support for the Internationalized Domain Names in
|
||||
Applications (IDNA) protocol as specified in `RFC 5891
|
||||
<https://tools.ietf.org/html/rfc5891>`_. This is the latest version of
|
||||
the protocol and is sometimes referred to as “IDNA 2008”.
|
||||
|
||||
This library also provides support for Unicode Technical
|
||||
Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
This acts as a suitable replacement for the “encodings.idna”
|
||||
module that comes with the Python standard library, but which
|
||||
only supports the older superseded IDNA specification (`RFC 3490
|
||||
<https://tools.ietf.org/html/rfc3490>`_).
|
||||
|
||||
Basic functions are simply executed:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
This package is available for installation from PyPI:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python3 -m pip install idna
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
For typical usage, the ``encode`` and ``decode`` functions will take a
|
||||
domain name argument and perform a conversion to A-labels or U-labels
|
||||
respectively.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
You may use the codec encoding and decoding methods using the
|
||||
``idna.codec`` module:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna.codec
|
||||
>>> print('домен.испытание'.encode('idna'))
|
||||
b'xn--d1acufc.xn--80akhbyknj4f'
|
||||
>>> print(b'xn--d1acufc.xn--80akhbyknj4f'.decode('idna'))
|
||||
домен.испытание
|
||||
|
||||
Conversions can be applied at a per-label basis using the ``ulabel`` or
|
||||
``alabel`` functions if necessary:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.alabel('测试')
|
||||
b'xn--0zwm56d'
|
||||
|
||||
Compatibility Mapping (UTS #46)
|
||||
+++++++++++++++++++++++++++++++
|
||||
|
||||
As described in `RFC 5895 <https://tools.ietf.org/html/rfc5895>`_, the
|
||||
IDNA specification does not normalize input from different potential
|
||||
ways a user may input a domain name. This functionality, known as
|
||||
a “mapping”, is considered by the specification to be a local
|
||||
user-interface issue distinct from IDNA conversion functionality.
|
||||
|
||||
This library provides one such mapping, that was developed by the
|
||||
Unicode Consortium. Known as `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_, it provides for both a regular
|
||||
mapping for typical applications, as well as a transitional mapping to
|
||||
help migrate from older IDNA 2003 applications.
|
||||
|
||||
For example, “Königsgäßchen” is not a permissible label as *LATIN
|
||||
CAPITAL LETTER K* is not allowed (nor are capital letters in general).
|
||||
UTS 46 will convert this into lower case prior to applying the IDNA
|
||||
conversion.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('Königsgäßchen')
|
||||
...
|
||||
idna.core.InvalidCodepoint: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed
|
||||
>>> idna.encode('Königsgäßchen', uts46=True)
|
||||
b'xn--knigsgchen-b4a3dun'
|
||||
>>> print(idna.decode('xn--knigsgchen-b4a3dun'))
|
||||
königsgäßchen
|
||||
|
||||
Transitional processing provides conversions to help transition from
|
||||
the older 2003 standard to the current standard. For example, in the
|
||||
original IDNA specification, the *LATIN SMALL LETTER SHARP S* (ß) was
|
||||
converted into two *LATIN SMALL LETTER S* (ss), whereas in the current
|
||||
IDNA specification this conversion is not performed.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.encode('Königsgäßchen', uts46=True, transitional=True)
|
||||
'xn--knigsgsschen-lcb0w'
|
||||
|
||||
Implementors should use transitional processing with caution, only in
|
||||
rare cases where conversion from legacy labels to current labels must be
|
||||
performed (i.e. IDNA implementations that pre-date 2008). For typical
|
||||
applications that just need to convert labels, transitional processing
|
||||
is unlikely to be beneficial and could produce unexpected incompatible
|
||||
results.
|
||||
|
||||
``encodings.idna`` Compatibility
|
||||
++++++++++++++++++++++++++++++++
|
||||
|
||||
Function calls from the Python built-in ``encodings.idna`` module are
|
||||
mapped to their IDNA 2008 equivalents using the ``idna.compat`` module.
|
||||
Simply substitute the ``import`` clause in your code to refer to the new
|
||||
module name.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
All errors raised during the conversion following the specification
|
||||
should raise an exception derived from the ``idna.IDNAError`` base
|
||||
class.
|
||||
|
||||
More specific exceptions that may be generated as ``idna.IDNABidiError``
|
||||
when the error reflects an illegal combination of left-to-right and
|
||||
right-to-left characters in a label; ``idna.InvalidCodepoint`` when
|
||||
a specific codepoint is an illegal character in an IDN label (i.e.
|
||||
INVALID); and ``idna.InvalidCodepointContext`` when the codepoint is
|
||||
illegal based on its positional context (i.e. it is CONTEXTO or CONTEXTJ
|
||||
but the contextual requirements are not satisfied.)
|
||||
|
||||
Building and Diagnostics
|
||||
------------------------
|
||||
|
||||
The IDNA and UTS 46 functionality relies upon pre-calculated lookup
|
||||
tables for performance. These tables are derived from computing against
|
||||
eligibility criteria in the respective standards. These tables are
|
||||
computed using the command-line script ``tools/idna-data``.
|
||||
|
||||
This tool will fetch relevant codepoint data from the Unicode repository
|
||||
and perform the required calculations to identify eligibility. There are
|
||||
three main modes:
|
||||
|
||||
* ``idna-data make-libdata``. Generates ``idnadata.py`` and
|
||||
``uts46data.py``, the pre-calculated lookup tables using for IDNA and
|
||||
UTS 46 conversions. Implementors who wish to track this library against
|
||||
a different Unicode version may use this tool to manually generate a
|
||||
different version of the ``idnadata.py`` and ``uts46data.py`` files.
|
||||
|
||||
* ``idna-data make-table``. Generate a table of the IDNA disposition
|
||||
(e.g. PVALID, CONTEXTJ, CONTEXTO) in the format found in Appendix
|
||||
B.1 of RFC 5892 and the pre-computed tables published by `IANA
|
||||
<https://www.iana.org/>`_.
|
||||
|
||||
* ``idna-data U+0061``. Prints debugging output on the various
|
||||
properties associated with an individual Unicode codepoint (in this
|
||||
case, U+0061), that are used to assess the IDNA and UTS 46 status of a
|
||||
codepoint. This is helpful in debugging or analysis.
|
||||
|
||||
The tool accepts a number of arguments, described using ``idna-data
|
||||
-h``. Most notably, the ``--version`` argument allows the specification
|
||||
of the version of Unicode to use in computing the table data. For
|
||||
example, ``idna-data --version 9.0.0 make-libdata`` will generate
|
||||
library data against Unicode 9.0.0.
|
||||
|
||||
|
||||
Additional Notes
|
||||
----------------
|
||||
|
||||
* **Packages**. The latest tagged release version is published in the
|
||||
`Python Package Index <https://pypi.org/project/idna/>`_.
|
||||
|
||||
* **Version support**. This library supports Python 3.5 and higher.
|
||||
As this library serves as a low-level toolkit for a variety of
|
||||
applications, many of which strive for broad compatibility with older
|
||||
Python versions, there is no rush to remove older intepreter support.
|
||||
Removing support for older versions should be well justified in that the
|
||||
maintenance burden has become too high.
|
||||
|
||||
* **Python 2**. Python 2 is supported by version 2.x of this library.
|
||||
While active development of the version 2.x series has ended, notable
|
||||
issues being corrected may be backported to 2.x. Use "idna<3" in your
|
||||
requirements file if you need this library for a Python 2 application.
|
||||
|
||||
* **Testing**. The library has a test suite based on each rule of the
|
||||
IDNA specification, as well as tests that are provided as part of the
|
||||
Unicode Technical Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
* **Emoji**. It is an occasional request to support emoji domains in
|
||||
this library. Encoding of symbols like emoji is expressly prohibited by
|
||||
the technical standard IDNA 2008 and emoji domains are broadly phased
|
||||
out across the domain industry due to associated security risks. For
|
||||
now, applications that wish need to support these non-compliant labels
|
||||
may wish to consider trying the encode/decode operation in this library
|
||||
first, and then falling back to using `encodings.idna`. See `the Github
|
||||
project <https://github.com/kjd/idna/issues/18>`_ for more discussion.
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
idna-3.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
idna-3.4.dist-info/LICENSE.md,sha256=otbk2UC9JNvnuWRc3hmpeSzFHbeuDVrNMBrIYMqj6DY,1523
|
||||
idna-3.4.dist-info/METADATA,sha256=8aLSf9MFS7oB26pZh2hprg7eJp0UJSc-3rpf_evp4DA,9830
|
||||
idna-3.4.dist-info/RECORD,,
|
||||
idna-3.4.dist-info/WHEEL,sha256=4TfKIB_xu-04bc2iKz6_zFt-gEFEEDU_31HGhqzOCE8,81
|
||||
idna/__init__.py,sha256=KJQN1eQBr8iIK5SKrJ47lXvxG0BJ7Lm38W4zT0v_8lk,849
|
||||
idna/__pycache__/__init__.cpython-310.pyc,,
|
||||
idna/__pycache__/codec.cpython-310.pyc,,
|
||||
idna/__pycache__/compat.cpython-310.pyc,,
|
||||
idna/__pycache__/core.cpython-310.pyc,,
|
||||
idna/__pycache__/idnadata.cpython-310.pyc,,
|
||||
idna/__pycache__/intranges.cpython-310.pyc,,
|
||||
idna/__pycache__/package_data.cpython-310.pyc,,
|
||||
idna/__pycache__/uts46data.cpython-310.pyc,,
|
||||
idna/codec.py,sha256=6ly5odKfqrytKT9_7UrlGklHnf1DSK2r9C6cSM4sa28,3374
|
||||
idna/compat.py,sha256=0_sOEUMT4CVw9doD3vyRhX80X19PwqFoUBs7gWsFME4,321
|
||||
idna/core.py,sha256=1JxchwKzkxBSn7R_oCE12oBu3eVux0VzdxolmIad24M,12950
|
||||
idna/idnadata.py,sha256=xUjqKqiJV8Ho_XzBpAtv5JFoVPSupK-SUXvtjygUHqw,44375
|
||||
idna/intranges.py,sha256=YBr4fRYuWH7kTKS2tXlFjM24ZF1Pdvcir-aywniInqg,1881
|
||||
idna/package_data.py,sha256=C_jHJzmX8PI4xq0jpzmcTMxpb5lDsq4o5VyxQzlVrZE,21
|
||||
idna/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
idna/uts46data.py,sha256=zvjZU24s58_uAS850Mcd0NnD0X7_gCMAMjzWNIeUJdc,206539
|
||||
@@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: flit 3.7.1
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
@@ -0,0 +1,44 @@
|
||||
from .package_data import __version__
|
||||
from .core import (
|
||||
IDNABidiError,
|
||||
IDNAError,
|
||||
InvalidCodepoint,
|
||||
InvalidCodepointContext,
|
||||
alabel,
|
||||
check_bidi,
|
||||
check_hyphen_ok,
|
||||
check_initial_combiner,
|
||||
check_label,
|
||||
check_nfc,
|
||||
decode,
|
||||
encode,
|
||||
ulabel,
|
||||
uts46_remap,
|
||||
valid_contextj,
|
||||
valid_contexto,
|
||||
valid_label_length,
|
||||
valid_string_length,
|
||||
)
|
||||
from .intranges import intranges_contain
|
||||
|
||||
__all__ = [
|
||||
"IDNABidiError",
|
||||
"IDNAError",
|
||||
"InvalidCodepoint",
|
||||
"InvalidCodepointContext",
|
||||
"alabel",
|
||||
"check_bidi",
|
||||
"check_hyphen_ok",
|
||||
"check_initial_combiner",
|
||||
"check_label",
|
||||
"check_nfc",
|
||||
"decode",
|
||||
"encode",
|
||||
"intranges_contain",
|
||||
"ulabel",
|
||||
"uts46_remap",
|
||||
"valid_contextj",
|
||||
"valid_contexto",
|
||||
"valid_label_length",
|
||||
"valid_string_length",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,112 @@
|
||||
from .core import encode, decode, alabel, ulabel, IDNAError
|
||||
import codecs
|
||||
import re
|
||||
from typing import Tuple, Optional
|
||||
|
||||
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
|
||||
def encode(self, data: str, errors: str = 'strict') -> Tuple[bytes, int]:
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return b"", 0
|
||||
|
||||
return encode(data), len(data)
|
||||
|
||||
def decode(self, data: bytes, errors: str = 'strict') -> Tuple[str, int]:
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return '', 0
|
||||
|
||||
return decode(data), len(data)
|
||||
|
||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
|
||||
def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return "", 0
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = ''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = '.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = '.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(alabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
# Join with U+002E
|
||||
result_str = '.'.join(result) + trailing_dot # type: ignore
|
||||
size += len(trailing_dot)
|
||||
return result_str, size
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def _buffer_decode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return ('', 0)
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = ''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = '.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = '.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(ulabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
result_str = '.'.join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return (result_str, size)
|
||||
|
||||
|
||||
class StreamWriter(Codec, codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
|
||||
class StreamReader(Codec, codecs.StreamReader):
|
||||
pass
|
||||
|
||||
|
||||
def getregentry() -> codecs.CodecInfo:
|
||||
# Compatibility as a search_function for codecs.register()
|
||||
return codecs.CodecInfo(
|
||||
name='idna',
|
||||
encode=Codec().encode, # type: ignore
|
||||
decode=Codec().decode, # type: ignore
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
)
|
||||
@@ -0,0 +1,13 @@
|
||||
from .core import *
|
||||
from .codec import *
|
||||
from typing import Any, Union
|
||||
|
||||
def ToASCII(label: str) -> bytes:
|
||||
return encode(label)
|
||||
|
||||
def ToUnicode(label: Union[bytes, bytearray]) -> str:
|
||||
return decode(label)
|
||||
|
||||
def nameprep(s: Any) -> None:
|
||||
raise NotImplementedError('IDNA 2008 does not utilise nameprep protocol')
|
||||
|
||||
@@ -0,0 +1,400 @@
|
||||
from . import idnadata
|
||||
import bisect
|
||||
import unicodedata
|
||||
import re
|
||||
from typing import Union, Optional
|
||||
from .intranges import intranges_contain
|
||||
|
||||
_virama_combining_class = 9
|
||||
_alabel_prefix = b'xn--'
|
||||
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
class IDNAError(UnicodeError):
|
||||
""" Base exception for all IDNA-encoding related problems """
|
||||
pass
|
||||
|
||||
|
||||
class IDNABidiError(IDNAError):
|
||||
""" Exception when bidirectional requirements are not satisfied """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepoint(IDNAError):
|
||||
""" Exception when a disallowed or unallocated codepoint is used """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepointContext(IDNAError):
|
||||
""" Exception when the codepoint is not valid in the context it is used """
|
||||
pass
|
||||
|
||||
|
||||
def _combining_class(cp: int) -> int:
|
||||
v = unicodedata.combining(chr(cp))
|
||||
if v == 0:
|
||||
if not unicodedata.name(chr(cp)):
|
||||
raise ValueError('Unknown character in unicodedata')
|
||||
return v
|
||||
|
||||
def _is_script(cp: str, script: str) -> bool:
|
||||
return intranges_contain(ord(cp), idnadata.scripts[script])
|
||||
|
||||
def _punycode(s: str) -> bytes:
|
||||
return s.encode('punycode')
|
||||
|
||||
def _unot(s: int) -> str:
|
||||
return 'U+{:04X}'.format(s)
|
||||
|
||||
|
||||
def valid_label_length(label: Union[bytes, str]) -> bool:
|
||||
if len(label) > 63:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
|
||||
if len(label) > (254 if trailing_dot else 253):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def check_bidi(label: str, check_ltr: bool = False) -> bool:
|
||||
# Bidi rules should only be applied if string contains RTL characters
|
||||
bidi_label = False
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
if direction == '':
|
||||
# String likely comes from a newer version of Unicode
|
||||
raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), idx))
|
||||
if direction in ['R', 'AL', 'AN']:
|
||||
bidi_label = True
|
||||
if not bidi_label and not check_ltr:
|
||||
return True
|
||||
|
||||
# Bidi rule 1
|
||||
direction = unicodedata.bidirectional(label[0])
|
||||
if direction in ['R', 'AL']:
|
||||
rtl = True
|
||||
elif direction == 'L':
|
||||
rtl = False
|
||||
else:
|
||||
raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))
|
||||
|
||||
valid_ending = False
|
||||
number_type = None # type: Optional[str]
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
|
||||
if rtl:
|
||||
# Bidi rule 2
|
||||
if not direction in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(idx))
|
||||
# Bidi rule 3
|
||||
if direction in ['R', 'AL', 'EN', 'AN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
# Bidi rule 4
|
||||
if direction in ['AN', 'EN']:
|
||||
if not number_type:
|
||||
number_type = direction
|
||||
else:
|
||||
if number_type != direction:
|
||||
raise IDNABidiError('Can not mix numeral types in a right-to-left label')
|
||||
else:
|
||||
# Bidi rule 5
|
||||
if not direction in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(idx))
|
||||
# Bidi rule 6
|
||||
if direction in ['L', 'EN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
|
||||
if not valid_ending:
|
||||
raise IDNABidiError('Label ends with illegal codepoint directionality')
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def check_initial_combiner(label: str) -> bool:
|
||||
if unicodedata.category(label[0])[0] == 'M':
|
||||
raise IDNAError('Label begins with an illegal combining character')
|
||||
return True
|
||||
|
||||
|
||||
def check_hyphen_ok(label: str) -> bool:
|
||||
if label[2:4] == '--':
|
||||
raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
|
||||
if label[0] == '-' or label[-1] == '-':
|
||||
raise IDNAError('Label must not start or end with a hyphen')
|
||||
return True
|
||||
|
||||
|
||||
def check_nfc(label: str) -> None:
|
||||
if unicodedata.normalize('NFC', label) != label:
|
||||
raise IDNAError('Label must be in Normalization Form C')
|
||||
|
||||
|
||||
def valid_contextj(label: str, pos: int) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x200c:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
|
||||
ok = False
|
||||
for i in range(pos-1, -1, -1):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('L'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
|
||||
if not ok:
|
||||
return False
|
||||
|
||||
ok = False
|
||||
for i in range(pos+1, len(label)):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('R'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
return ok
|
||||
|
||||
if cp_value == 0x200d:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
return False
|
||||
|
||||
else:
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x00b7:
|
||||
if 0 < pos < len(label)-1:
|
||||
if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
|
||||
return True
|
||||
return False
|
||||
|
||||
elif cp_value == 0x0375:
|
||||
if pos < len(label)-1 and len(label) > 1:
|
||||
return _is_script(label[pos + 1], 'Greek')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x05f3 or cp_value == 0x05f4:
|
||||
if pos > 0:
|
||||
return _is_script(label[pos - 1], 'Hebrew')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x30fb:
|
||||
for cp in label:
|
||||
if cp == '\u30fb':
|
||||
continue
|
||||
if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
|
||||
return True
|
||||
return False
|
||||
|
||||
elif 0x660 <= cp_value <= 0x669:
|
||||
for cp in label:
|
||||
if 0x6f0 <= ord(cp) <= 0x06f9:
|
||||
return False
|
||||
return True
|
||||
|
||||
elif 0x6f0 <= cp_value <= 0x6f9:
|
||||
for cp in label:
|
||||
if 0x660 <= ord(cp) <= 0x0669:
|
||||
return False
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def check_label(label: Union[str, bytes, bytearray]) -> None:
|
||||
if isinstance(label, (bytes, bytearray)):
|
||||
label = label.decode('utf-8')
|
||||
if len(label) == 0:
|
||||
raise IDNAError('Empty Label')
|
||||
|
||||
check_nfc(label)
|
||||
check_hyphen_ok(label)
|
||||
check_initial_combiner(label)
|
||||
|
||||
for (pos, cp) in enumerate(label):
|
||||
cp_value = ord(cp)
|
||||
if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
|
||||
continue
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
|
||||
try:
|
||||
if not valid_contextj(label, pos):
|
||||
raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
except ValueError:
|
||||
raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
|
||||
if not valid_contexto(label, pos):
|
||||
raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
else:
|
||||
raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
|
||||
check_bidi(label)
|
||||
|
||||
|
||||
def alabel(label: str) -> bytes:
|
||||
try:
|
||||
label_bytes = label.encode('ascii')
|
||||
ulabel(label_bytes)
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError('Label too long')
|
||||
return label_bytes
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
|
||||
if not label:
|
||||
raise IDNAError('No Input')
|
||||
|
||||
label = str(label)
|
||||
check_label(label)
|
||||
label_bytes = _punycode(label)
|
||||
label_bytes = _alabel_prefix + label_bytes
|
||||
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError('Label too long')
|
||||
|
||||
return label_bytes
|
||||
|
||||
|
||||
def ulabel(label: Union[str, bytes, bytearray]) -> str:
|
||||
if not isinstance(label, (bytes, bytearray)):
|
||||
try:
|
||||
label_bytes = label.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
check_label(label)
|
||||
return label
|
||||
else:
|
||||
label_bytes = label
|
||||
|
||||
label_bytes = label_bytes.lower()
|
||||
if label_bytes.startswith(_alabel_prefix):
|
||||
label_bytes = label_bytes[len(_alabel_prefix):]
|
||||
if not label_bytes:
|
||||
raise IDNAError('Malformed A-label, no Punycode eligible content found')
|
||||
if label_bytes.decode('ascii')[-1] == '-':
|
||||
raise IDNAError('A-label must not end with a hyphen')
|
||||
else:
|
||||
check_label(label_bytes)
|
||||
return label_bytes.decode('ascii')
|
||||
|
||||
try:
|
||||
label = label_bytes.decode('punycode')
|
||||
except UnicodeError:
|
||||
raise IDNAError('Invalid A-label')
|
||||
check_label(label)
|
||||
return label
|
||||
|
||||
|
||||
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
|
||||
"""Re-map the characters in the string according to UTS46 processing."""
|
||||
from .uts46data import uts46data
|
||||
output = ''
|
||||
|
||||
for pos, char in enumerate(domain):
|
||||
code_point = ord(char)
|
||||
try:
|
||||
uts46row = uts46data[code_point if code_point < 256 else
|
||||
bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
|
||||
status = uts46row[1]
|
||||
replacement = None # type: Optional[str]
|
||||
if len(uts46row) == 3:
|
||||
replacement = uts46row[2] # type: ignore
|
||||
if (status == 'V' or
|
||||
(status == 'D' and not transitional) or
|
||||
(status == '3' and not std3_rules and replacement is None)):
|
||||
output += char
|
||||
elif replacement is not None and (status == 'M' or
|
||||
(status == '3' and not std3_rules) or
|
||||
(status == 'D' and transitional)):
|
||||
output += replacement
|
||||
elif status != 'I':
|
||||
raise IndexError()
|
||||
except IndexError:
|
||||
raise InvalidCodepoint(
|
||||
'Codepoint {} not allowed at position {} in {}'.format(
|
||||
_unot(code_point), pos + 1, repr(domain)))
|
||||
|
||||
return unicodedata.normalize('NFC', output)
|
||||
|
||||
|
||||
def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
try:
|
||||
s = s.decode('ascii')
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError('should pass a unicode string to the function rather than a byte string.')
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, transitional)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if strict:
|
||||
labels = s.split('.')
|
||||
else:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if labels[-1] == '':
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = alabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append(b'')
|
||||
s = b'.'.join(result)
|
||||
if not valid_string_length(s, trailing_dot):
|
||||
raise IDNAError('Domain too long')
|
||||
return s
|
||||
|
||||
|
||||
def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
|
||||
try:
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
s = s.decode('ascii')
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError('Invalid ASCII in A-label')
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, False)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if not strict:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
else:
|
||||
labels = s.split('.')
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if not labels[-1]:
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = ulabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append('')
|
||||
return '.'.join(result)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,54 @@
|
||||
"""
|
||||
Given a list of integers, made up of (hopefully) a small number of long runs
|
||||
of consecutive integers, compute a representation of the form
|
||||
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
||||
in the original list?" in time O(log(# runs)).
|
||||
"""
|
||||
|
||||
import bisect
|
||||
from typing import List, Tuple
|
||||
|
||||
def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
|
||||
"""Represent a list of integers as a sequence of ranges:
|
||||
((start_0, end_0), (start_1, end_1), ...), such that the original
|
||||
integers are exactly those x such that start_i <= x < end_i for some i.
|
||||
|
||||
Ranges are encoded as single integers (start << 32 | end), not as tuples.
|
||||
"""
|
||||
|
||||
sorted_list = sorted(list_)
|
||||
ranges = []
|
||||
last_write = -1
|
||||
for i in range(len(sorted_list)):
|
||||
if i+1 < len(sorted_list):
|
||||
if sorted_list[i] == sorted_list[i+1]-1:
|
||||
continue
|
||||
current_range = sorted_list[last_write+1:i+1]
|
||||
ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
|
||||
last_write = i
|
||||
|
||||
return tuple(ranges)
|
||||
|
||||
def _encode_range(start: int, end: int) -> int:
|
||||
return (start << 32) | end
|
||||
|
||||
def _decode_range(r: int) -> Tuple[int, int]:
|
||||
return (r >> 32), (r & ((1 << 32) - 1))
|
||||
|
||||
|
||||
def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
|
||||
"""Determine if `int_` falls into one of the ranges in `ranges`."""
|
||||
tuple_ = _encode_range(int_, 0)
|
||||
pos = bisect.bisect_left(ranges, tuple_)
|
||||
# we could be immediately ahead of a tuple (start, end)
|
||||
# with start < int_ <= end
|
||||
if pos > 0:
|
||||
left, right = _decode_range(ranges[pos-1])
|
||||
if left <= int_ < right:
|
||||
return True
|
||||
# or we could be immediately behind a tuple (int_, end)
|
||||
if pos < len(ranges):
|
||||
left, _ = _decode_range(ranges[pos])
|
||||
if left == int_:
|
||||
return True
|
||||
return False
|
||||
@@ -0,0 +1,2 @@
|
||||
__version__ = '3.4'
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1 @@
|
||||
pip
|
||||
@@ -0,0 +1,175 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
@@ -0,0 +1,122 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: requests
|
||||
Version: 2.28.1
|
||||
Summary: Python HTTP for Humans.
|
||||
Home-page: https://requests.readthedocs.io
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.org
|
||||
License: Apache 2.0
|
||||
Project-URL: Documentation, https://requests.readthedocs.io
|
||||
Project-URL: Source, https://github.com/psf/requests
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Environment :: Web Environment
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: WWW/HTTP
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Requires-Python: >=3.7, <4
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: charset-normalizer (<3,>=2)
|
||||
Requires-Dist: idna (<4,>=2.5)
|
||||
Requires-Dist: urllib3 (<1.27,>=1.21.1)
|
||||
Requires-Dist: certifi (>=2017.4.17)
|
||||
Provides-Extra: security
|
||||
Provides-Extra: socks
|
||||
Requires-Dist: PySocks (!=1.5.7,>=1.5.6) ; extra == 'socks'
|
||||
Provides-Extra: use_chardet_on_py3
|
||||
Requires-Dist: chardet (<6,>=3.0.2) ; extra == 'use_chardet_on_py3'
|
||||
|
||||
# Requests
|
||||
|
||||
**Requests** is a simple, yet elegant, HTTP library.
|
||||
|
||||
```python
|
||||
>>> import requests
|
||||
>>> r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=('user', 'pass'))
|
||||
>>> r.status_code
|
||||
200
|
||||
>>> r.headers['content-type']
|
||||
'application/json; charset=utf8'
|
||||
>>> r.encoding
|
||||
'utf-8'
|
||||
>>> r.text
|
||||
'{"authenticated": true, ...'
|
||||
>>> r.json()
|
||||
{'authenticated': True, ...}
|
||||
```
|
||||
|
||||
Requests allows you to send HTTP/1.1 requests extremely easily. There’s no need to manually add query strings to your URLs, or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method!
|
||||
|
||||
Requests is one of the most downloaded Python packages today, pulling in around `30M downloads / week`— according to GitHub, Requests is currently [depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) by `1,000,000+` repositories. You may certainly put your trust in this code.
|
||||
|
||||
[](https://pepy.tech/project/requests)
|
||||
[](https://pypi.org/project/requests)
|
||||
[](https://github.com/psf/requests/graphs/contributors)
|
||||
|
||||
## Installing Requests and Supported Versions
|
||||
|
||||
Requests is available on PyPI:
|
||||
|
||||
```console
|
||||
$ python -m pip install requests
|
||||
```
|
||||
|
||||
Requests officially supports Python 3.7+.
|
||||
|
||||
## Supported Features & Best–Practices
|
||||
|
||||
Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today.
|
||||
|
||||
- Keep-Alive & Connection Pooling
|
||||
- International Domains and URLs
|
||||
- Sessions with Cookie Persistence
|
||||
- Browser-style TLS/SSL Verification
|
||||
- Basic & Digest Authentication
|
||||
- Familiar `dict`–like Cookies
|
||||
- Automatic Content Decompression and Decoding
|
||||
- Multi-part File Uploads
|
||||
- SOCKS Proxy Support
|
||||
- Connection Timeouts
|
||||
- Streaming Downloads
|
||||
- Automatic honoring of `.netrc`
|
||||
- Chunked HTTP Requests
|
||||
|
||||
## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io)
|
||||
|
||||
[](https://requests.readthedocs.io)
|
||||
|
||||
## Cloning the repository
|
||||
|
||||
When cloning the Requests repository, you may need to add the `-c
|
||||
fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit (see
|
||||
[this issue](https://github.com/psf/requests/issues/2690) for more background):
|
||||
|
||||
```shell
|
||||
git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git
|
||||
```
|
||||
|
||||
You can also apply this setting to your global Git config:
|
||||
|
||||
```shell
|
||||
git config --global fetch.fsck.badTimezone ignore
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
[](https://kennethreitz.org) [](https://www.python.org/psf)
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
requests-2.28.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
requests-2.28.1.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
|
||||
requests-2.28.1.dist-info/METADATA,sha256=eoNYSJuPWbql7Til9dKPb--KRU2ouGR4g7UxtZFoHNU,4641
|
||||
requests-2.28.1.dist-info/RECORD,,
|
||||
requests-2.28.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
requests-2.28.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
requests-2.28.1.dist-info/top_level.txt,sha256=fMSVmHfb5rbGOo6xv-O_tUX6j-WyixssE-SnwcDRxNQ,9
|
||||
requests/__init__.py,sha256=S2K0jnVP6CSrT51SctFyiB0XfI8H9Nt7EqzERAD44gg,4972
|
||||
requests/__pycache__/__init__.cpython-310.pyc,,
|
||||
requests/__pycache__/__version__.cpython-310.pyc,,
|
||||
requests/__pycache__/_internal_utils.cpython-310.pyc,,
|
||||
requests/__pycache__/adapters.cpython-310.pyc,,
|
||||
requests/__pycache__/api.cpython-310.pyc,,
|
||||
requests/__pycache__/auth.cpython-310.pyc,,
|
||||
requests/__pycache__/certs.cpython-310.pyc,,
|
||||
requests/__pycache__/compat.cpython-310.pyc,,
|
||||
requests/__pycache__/cookies.cpython-310.pyc,,
|
||||
requests/__pycache__/exceptions.cpython-310.pyc,,
|
||||
requests/__pycache__/help.cpython-310.pyc,,
|
||||
requests/__pycache__/hooks.cpython-310.pyc,,
|
||||
requests/__pycache__/models.cpython-310.pyc,,
|
||||
requests/__pycache__/packages.cpython-310.pyc,,
|
||||
requests/__pycache__/sessions.cpython-310.pyc,,
|
||||
requests/__pycache__/status_codes.cpython-310.pyc,,
|
||||
requests/__pycache__/structures.cpython-310.pyc,,
|
||||
requests/__pycache__/utils.cpython-310.pyc,,
|
||||
requests/__version__.py,sha256=nJVa3ef2yRyeYMhy7yHnRyjjpnNTDykZsE4Sp9irBC4,440
|
||||
requests/_internal_utils.py,sha256=aSPlF4uDhtfKxEayZJJ7KkAxtormeTfpwKSBSwtmAUw,1397
|
||||
requests/adapters.py,sha256=sEnHGl4mJz4QHBT8jG6bU5aPinUtdoH3BIuAIzT-X74,21287
|
||||
requests/api.py,sha256=dyvkDd5itC9z2g0wHl_YfD1yf6YwpGWLO7__8e21nks,6377
|
||||
requests/auth.py,sha256=h-HLlVx9j8rKV5hfSAycP2ApOSglTz77R0tz7qCbbEE,10187
|
||||
requests/certs.py,sha256=Z9Sb410Anv6jUFTyss0jFFhU6xst8ctELqfy8Ev23gw,429
|
||||
requests/compat.py,sha256=yxntVOSEHGMrn7FNr_32EEam1ZNAdPRdSE13_yaHzTk,1451
|
||||
requests/cookies.py,sha256=kD3kNEcCj-mxbtf5fJsSaT86eGoEYpD3X0CSgpzl7BM,18560
|
||||
requests/exceptions.py,sha256=DhveFBclVjTRxhRduVpO-GbMYMID2gmjdLfNEqNpI_U,3811
|
||||
requests/help.py,sha256=gPX5d_H7Xd88aDABejhqGgl9B1VFRTt5BmiYvL3PzIQ,3875
|
||||
requests/hooks.py,sha256=CiuysiHA39V5UfcCBXFIx83IrDpuwfN9RcTUgv28ftQ,733
|
||||
requests/models.py,sha256=OiVxiOdlhzpbZoxut2OhKtpYlB7WW4iHQcfqSVmT4H4,35222
|
||||
requests/packages.py,sha256=DXgv-FJIczZITmv0vEBAhWj4W-5CGCIN_ksvgR17Dvs,957
|
||||
requests/sessions.py,sha256=KUqJcRRLovNefUs7ScOXSUVCcfSayTFWtbiJ7gOSlTI,30180
|
||||
requests/status_codes.py,sha256=FvHmT5uH-_uimtRz5hH9VCbt7VV-Nei2J9upbej6j8g,4235
|
||||
requests/structures.py,sha256=-IbmhVz06S-5aPSZuUthZ6-6D9XOjRuTXHOabY041XM,2912
|
||||
requests/utils.py,sha256=5_ws-bsKI9EHl7j27yi-6HFzPBKultPgd7HfPrUToWI,33228
|
||||
@@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
requests
|
||||
@@ -0,0 +1,180 @@
|
||||
# __
|
||||
# /__) _ _ _ _ _/ _
|
||||
# / ( (- (/ (/ (- _) / _)
|
||||
# /
|
||||
|
||||
"""
|
||||
Requests HTTP Library
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Requests is an HTTP library, written in Python, for human beings.
|
||||
Basic GET usage:
|
||||
|
||||
>>> import requests
|
||||
>>> r = requests.get('https://www.python.org')
|
||||
>>> r.status_code
|
||||
200
|
||||
>>> b'Python is a programming language' in r.content
|
||||
True
|
||||
|
||||
... or POST:
|
||||
|
||||
>>> payload = dict(key1='value1', key2='value2')
|
||||
>>> r = requests.post('https://httpbin.org/post', data=payload)
|
||||
>>> print(r.text)
|
||||
{
|
||||
...
|
||||
"form": {
|
||||
"key1": "value1",
|
||||
"key2": "value2"
|
||||
},
|
||||
...
|
||||
}
|
||||
|
||||
The other HTTP methods are supported - see `requests.api`. Full documentation
|
||||
is at <https://requests.readthedocs.io>.
|
||||
|
||||
:copyright: (c) 2017 by Kenneth Reitz.
|
||||
:license: Apache 2.0, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
import urllib3
|
||||
|
||||
from .exceptions import RequestsDependencyWarning
|
||||
|
||||
try:
|
||||
from charset_normalizer import __version__ as charset_normalizer_version
|
||||
except ImportError:
|
||||
charset_normalizer_version = None
|
||||
|
||||
try:
|
||||
from chardet import __version__ as chardet_version
|
||||
except ImportError:
|
||||
chardet_version = None
|
||||
|
||||
|
||||
def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version):
|
||||
urllib3_version = urllib3_version.split(".")
|
||||
assert urllib3_version != ["dev"] # Verify urllib3 isn't installed from git.
|
||||
|
||||
# Sometimes, urllib3 only reports its version as 16.1.
|
||||
if len(urllib3_version) == 2:
|
||||
urllib3_version.append("0")
|
||||
|
||||
# Check urllib3 for compatibility.
|
||||
major, minor, patch = urllib3_version # noqa: F811
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# urllib3 >= 1.21.1, <= 1.26
|
||||
assert major == 1
|
||||
assert minor >= 21
|
||||
assert minor <= 26
|
||||
|
||||
# Check charset_normalizer for compatibility.
|
||||
if chardet_version:
|
||||
major, minor, patch = chardet_version.split(".")[:3]
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# chardet_version >= 3.0.2, < 6.0.0
|
||||
assert (3, 0, 2) <= (major, minor, patch) < (6, 0, 0)
|
||||
elif charset_normalizer_version:
|
||||
major, minor, patch = charset_normalizer_version.split(".")[:3]
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# charset_normalizer >= 2.0.0 < 3.0.0
|
||||
assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0)
|
||||
else:
|
||||
raise Exception("You need either charset_normalizer or chardet installed")
|
||||
|
||||
|
||||
def _check_cryptography(cryptography_version):
|
||||
# cryptography < 1.3.4
|
||||
try:
|
||||
cryptography_version = list(map(int, cryptography_version.split(".")))
|
||||
except ValueError:
|
||||
return
|
||||
|
||||
if cryptography_version < [1, 3, 4]:
|
||||
warning = "Old version of cryptography ({}) may cause slowdown.".format(
|
||||
cryptography_version
|
||||
)
|
||||
warnings.warn(warning, RequestsDependencyWarning)
|
||||
|
||||
|
||||
# Check imported dependencies for compatibility.
|
||||
try:
|
||||
check_compatibility(
|
||||
urllib3.__version__, chardet_version, charset_normalizer_version
|
||||
)
|
||||
except (AssertionError, ValueError):
|
||||
warnings.warn(
|
||||
"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
|
||||
"version!".format(
|
||||
urllib3.__version__, chardet_version, charset_normalizer_version
|
||||
),
|
||||
RequestsDependencyWarning,
|
||||
)
|
||||
|
||||
# Attempt to enable urllib3's fallback for SNI support
|
||||
# if the standard library doesn't support SNI or the
|
||||
# 'ssl' library isn't available.
|
||||
try:
|
||||
try:
|
||||
import ssl
|
||||
except ImportError:
|
||||
ssl = None
|
||||
|
||||
if not getattr(ssl, "HAS_SNI", False):
|
||||
from urllib3.contrib import pyopenssl
|
||||
|
||||
pyopenssl.inject_into_urllib3()
|
||||
|
||||
# Check cryptography version
|
||||
from cryptography import __version__ as cryptography_version
|
||||
|
||||
_check_cryptography(cryptography_version)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# urllib3's DependencyWarnings should be silenced.
|
||||
from urllib3.exceptions import DependencyWarning
|
||||
|
||||
warnings.simplefilter("ignore", DependencyWarning)
|
||||
|
||||
# Set default logging handler to avoid "No handler found" warnings.
|
||||
import logging
|
||||
from logging import NullHandler
|
||||
|
||||
from . import packages, utils
|
||||
from .__version__ import (
|
||||
__author__,
|
||||
__author_email__,
|
||||
__build__,
|
||||
__cake__,
|
||||
__copyright__,
|
||||
__description__,
|
||||
__license__,
|
||||
__title__,
|
||||
__url__,
|
||||
__version__,
|
||||
)
|
||||
from .api import delete, get, head, options, patch, post, put, request
|
||||
from .exceptions import (
|
||||
ConnectionError,
|
||||
ConnectTimeout,
|
||||
FileModeWarning,
|
||||
HTTPError,
|
||||
JSONDecodeError,
|
||||
ReadTimeout,
|
||||
RequestException,
|
||||
Timeout,
|
||||
TooManyRedirects,
|
||||
URLRequired,
|
||||
)
|
||||
from .models import PreparedRequest, Request, Response
|
||||
from .sessions import Session, session
|
||||
from .status_codes import codes
|
||||
|
||||
logging.getLogger(__name__).addHandler(NullHandler())
|
||||
|
||||
# FileModeWarnings go off per the default.
|
||||
warnings.simplefilter("default", FileModeWarning, append=True)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
# .-. .-. .-. . . .-. .-. .-. .-.
|
||||
# |( |- |.| | | |- `-. | `-.
|
||||
# ' ' `-' `-`.`-' `-' `-' ' `-'
|
||||
|
||||
__title__ = "requests"
|
||||
__description__ = "Python HTTP for Humans."
|
||||
__url__ = "https://requests.readthedocs.io"
|
||||
__version__ = "2.28.1"
|
||||
__build__ = 0x022801
|
||||
__author__ = "Kenneth Reitz"
|
||||
__author_email__ = "me@kennethreitz.org"
|
||||
__license__ = "Apache 2.0"
|
||||
__copyright__ = "Copyright 2022 Kenneth Reitz"
|
||||
__cake__ = "\u2728 \U0001f370 \u2728"
|
||||
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
requests._internal_utils
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
Provides utility functions that are consumed internally by Requests
|
||||
which depend on extremely few external helpers (such as compat)
|
||||
"""
|
||||
import re
|
||||
|
||||
from .compat import builtin_str
|
||||
|
||||
_VALID_HEADER_NAME_RE_BYTE = re.compile(rb"^[^:\s][^:\r\n]*$")
|
||||
_VALID_HEADER_NAME_RE_STR = re.compile(r"^[^:\s][^:\r\n]*$")
|
||||
_VALID_HEADER_VALUE_RE_BYTE = re.compile(rb"^\S[^\r\n]*$|^$")
|
||||
_VALID_HEADER_VALUE_RE_STR = re.compile(r"^\S[^\r\n]*$|^$")
|
||||
|
||||
HEADER_VALIDATORS = {
|
||||
bytes: (_VALID_HEADER_NAME_RE_BYTE, _VALID_HEADER_VALUE_RE_BYTE),
|
||||
str: (_VALID_HEADER_NAME_RE_STR, _VALID_HEADER_VALUE_RE_STR),
|
||||
}
|
||||
|
||||
|
||||
def to_native_string(string, encoding="ascii"):
|
||||
"""Given a string object, regardless of type, returns a representation of
|
||||
that string in the native string type, encoding and decoding where
|
||||
necessary. This assumes ASCII unless told otherwise.
|
||||
"""
|
||||
if isinstance(string, builtin_str):
|
||||
out = string
|
||||
else:
|
||||
out = string.decode(encoding)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def unicode_is_ascii(u_string):
|
||||
"""Determine if unicode string only contains ASCII characters.
|
||||
|
||||
:param str u_string: unicode string to check. Must be unicode
|
||||
and not Python 2 `str`.
|
||||
:rtype: bool
|
||||
"""
|
||||
assert isinstance(u_string, str)
|
||||
try:
|
||||
u_string.encode("ascii")
|
||||
return True
|
||||
except UnicodeEncodeError:
|
||||
return False
|
||||
@@ -0,0 +1,584 @@
|
||||
"""
|
||||
requests.adapters
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains the transport adapters that Requests uses to define
|
||||
and maintain connections.
|
||||
"""
|
||||
|
||||
import os.path
|
||||
import socket # noqa: F401
|
||||
|
||||
from urllib3.exceptions import ClosedPoolError, ConnectTimeoutError
|
||||
from urllib3.exceptions import HTTPError as _HTTPError
|
||||
from urllib3.exceptions import InvalidHeader as _InvalidHeader
|
||||
from urllib3.exceptions import (
|
||||
LocationValueError,
|
||||
MaxRetryError,
|
||||
NewConnectionError,
|
||||
ProtocolError,
|
||||
)
|
||||
from urllib3.exceptions import ProxyError as _ProxyError
|
||||
from urllib3.exceptions import ReadTimeoutError, ResponseError
|
||||
from urllib3.exceptions import SSLError as _SSLError
|
||||
from urllib3.poolmanager import PoolManager, proxy_from_url
|
||||
from urllib3.response import HTTPResponse
|
||||
from urllib3.util import Timeout as TimeoutSauce
|
||||
from urllib3.util import parse_url
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from .auth import _basic_auth_str
|
||||
from .compat import basestring, urlparse
|
||||
from .cookies import extract_cookies_to_jar
|
||||
from .exceptions import (
|
||||
ConnectionError,
|
||||
ConnectTimeout,
|
||||
InvalidHeader,
|
||||
InvalidProxyURL,
|
||||
InvalidSchema,
|
||||
InvalidURL,
|
||||
ProxyError,
|
||||
ReadTimeout,
|
||||
RetryError,
|
||||
SSLError,
|
||||
)
|
||||
from .models import Response
|
||||
from .structures import CaseInsensitiveDict
|
||||
from .utils import (
|
||||
DEFAULT_CA_BUNDLE_PATH,
|
||||
extract_zipped_paths,
|
||||
get_auth_from_url,
|
||||
get_encoding_from_headers,
|
||||
prepend_scheme_if_needed,
|
||||
select_proxy,
|
||||
urldefragauth,
|
||||
)
|
||||
|
||||
try:
|
||||
from urllib3.contrib.socks import SOCKSProxyManager
|
||||
except ImportError:
|
||||
|
||||
def SOCKSProxyManager(*args, **kwargs):
|
||||
raise InvalidSchema("Missing dependencies for SOCKS support.")
|
||||
|
||||
|
||||
DEFAULT_POOLBLOCK = False
|
||||
DEFAULT_POOLSIZE = 10
|
||||
DEFAULT_RETRIES = 0
|
||||
DEFAULT_POOL_TIMEOUT = None
|
||||
|
||||
|
||||
class BaseAdapter:
|
||||
"""The Base Transport Adapter"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def send(
|
||||
self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None
|
||||
):
|
||||
"""Sends PreparedRequest object. Returns Response object.
|
||||
|
||||
:param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
|
||||
:param stream: (optional) Whether to stream the request content.
|
||||
:param timeout: (optional) How long to wait for the server to send
|
||||
data before giving up, as a float, or a :ref:`(connect timeout,
|
||||
read timeout) <timeouts>` tuple.
|
||||
:type timeout: float or tuple
|
||||
:param verify: (optional) Either a boolean, in which case it controls whether we verify
|
||||
the server's TLS certificate, or a string, in which case it must be a path
|
||||
to a CA bundle to use
|
||||
:param cert: (optional) Any user-provided SSL certificate to be trusted.
|
||||
:param proxies: (optional) The proxies dictionary to apply to the request.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def close(self):
|
||||
"""Cleans up adapter specific items."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class HTTPAdapter(BaseAdapter):
|
||||
"""The built-in HTTP Adapter for urllib3.
|
||||
|
||||
Provides a general-case interface for Requests sessions to contact HTTP and
|
||||
HTTPS urls by implementing the Transport Adapter interface. This class will
|
||||
usually be created by the :class:`Session <Session>` class under the
|
||||
covers.
|
||||
|
||||
:param pool_connections: The number of urllib3 connection pools to cache.
|
||||
:param pool_maxsize: The maximum number of connections to save in the pool.
|
||||
:param max_retries: The maximum number of retries each connection
|
||||
should attempt. Note, this applies only to failed DNS lookups, socket
|
||||
connections and connection timeouts, never to requests where data has
|
||||
made it to the server. By default, Requests does not retry failed
|
||||
connections. If you need granular control over the conditions under
|
||||
which we retry a request, import urllib3's ``Retry`` class and pass
|
||||
that instead.
|
||||
:param pool_block: Whether the connection pool should block for connections.
|
||||
|
||||
Usage::
|
||||
|
||||
>>> import requests
|
||||
>>> s = requests.Session()
|
||||
>>> a = requests.adapters.HTTPAdapter(max_retries=3)
|
||||
>>> s.mount('http://', a)
|
||||
"""
|
||||
|
||||
__attrs__ = [
|
||||
"max_retries",
|
||||
"config",
|
||||
"_pool_connections",
|
||||
"_pool_maxsize",
|
||||
"_pool_block",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pool_connections=DEFAULT_POOLSIZE,
|
||||
pool_maxsize=DEFAULT_POOLSIZE,
|
||||
max_retries=DEFAULT_RETRIES,
|
||||
pool_block=DEFAULT_POOLBLOCK,
|
||||
):
|
||||
if max_retries == DEFAULT_RETRIES:
|
||||
self.max_retries = Retry(0, read=False)
|
||||
else:
|
||||
self.max_retries = Retry.from_int(max_retries)
|
||||
self.config = {}
|
||||
self.proxy_manager = {}
|
||||
|
||||
super().__init__()
|
||||
|
||||
self._pool_connections = pool_connections
|
||||
self._pool_maxsize = pool_maxsize
|
||||
self._pool_block = pool_block
|
||||
|
||||
self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block)
|
||||
|
||||
def __getstate__(self):
|
||||
return {attr: getattr(self, attr, None) for attr in self.__attrs__}
|
||||
|
||||
def __setstate__(self, state):
|
||||
# Can't handle by adding 'proxy_manager' to self.__attrs__ because
|
||||
# self.poolmanager uses a lambda function, which isn't pickleable.
|
||||
self.proxy_manager = {}
|
||||
self.config = {}
|
||||
|
||||
for attr, value in state.items():
|
||||
setattr(self, attr, value)
|
||||
|
||||
self.init_poolmanager(
|
||||
self._pool_connections, self._pool_maxsize, block=self._pool_block
|
||||
)
|
||||
|
||||
def init_poolmanager(
|
||||
self, connections, maxsize, block=DEFAULT_POOLBLOCK, **pool_kwargs
|
||||
):
|
||||
"""Initializes a urllib3 PoolManager.
|
||||
|
||||
This method should not be called from user code, and is only
|
||||
exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param connections: The number of urllib3 connection pools to cache.
|
||||
:param maxsize: The maximum number of connections to save in the pool.
|
||||
:param block: Block when no free connections are available.
|
||||
:param pool_kwargs: Extra keyword arguments used to initialize the Pool Manager.
|
||||
"""
|
||||
# save these values for pickling
|
||||
self._pool_connections = connections
|
||||
self._pool_maxsize = maxsize
|
||||
self._pool_block = block
|
||||
|
||||
self.poolmanager = PoolManager(
|
||||
num_pools=connections,
|
||||
maxsize=maxsize,
|
||||
block=block,
|
||||
strict=True,
|
||||
**pool_kwargs,
|
||||
)
|
||||
|
||||
def proxy_manager_for(self, proxy, **proxy_kwargs):
|
||||
"""Return urllib3 ProxyManager for the given proxy.
|
||||
|
||||
This method should not be called from user code, and is only
|
||||
exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param proxy: The proxy to return a urllib3 ProxyManager for.
|
||||
:param proxy_kwargs: Extra keyword arguments used to configure the Proxy Manager.
|
||||
:returns: ProxyManager
|
||||
:rtype: urllib3.ProxyManager
|
||||
"""
|
||||
if proxy in self.proxy_manager:
|
||||
manager = self.proxy_manager[proxy]
|
||||
elif proxy.lower().startswith("socks"):
|
||||
username, password = get_auth_from_url(proxy)
|
||||
manager = self.proxy_manager[proxy] = SOCKSProxyManager(
|
||||
proxy,
|
||||
username=username,
|
||||
password=password,
|
||||
num_pools=self._pool_connections,
|
||||
maxsize=self._pool_maxsize,
|
||||
block=self._pool_block,
|
||||
**proxy_kwargs,
|
||||
)
|
||||
else:
|
||||
proxy_headers = self.proxy_headers(proxy)
|
||||
manager = self.proxy_manager[proxy] = proxy_from_url(
|
||||
proxy,
|
||||
proxy_headers=proxy_headers,
|
||||
num_pools=self._pool_connections,
|
||||
maxsize=self._pool_maxsize,
|
||||
block=self._pool_block,
|
||||
**proxy_kwargs,
|
||||
)
|
||||
|
||||
return manager
|
||||
|
||||
def cert_verify(self, conn, url, verify, cert):
|
||||
"""Verify a SSL certificate. This method should not be called from user
|
||||
code, and is only exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param conn: The urllib3 connection object associated with the cert.
|
||||
:param url: The requested URL.
|
||||
:param verify: Either a boolean, in which case it controls whether we verify
|
||||
the server's TLS certificate, or a string, in which case it must be a path
|
||||
to a CA bundle to use
|
||||
:param cert: The SSL certificate to verify.
|
||||
"""
|
||||
if url.lower().startswith("https") and verify:
|
||||
|
||||
cert_loc = None
|
||||
|
||||
# Allow self-specified cert location.
|
||||
if verify is not True:
|
||||
cert_loc = verify
|
||||
|
||||
if not cert_loc:
|
||||
cert_loc = extract_zipped_paths(DEFAULT_CA_BUNDLE_PATH)
|
||||
|
||||
if not cert_loc or not os.path.exists(cert_loc):
|
||||
raise OSError(
|
||||
f"Could not find a suitable TLS CA certificate bundle, "
|
||||
f"invalid path: {cert_loc}"
|
||||
)
|
||||
|
||||
conn.cert_reqs = "CERT_REQUIRED"
|
||||
|
||||
if not os.path.isdir(cert_loc):
|
||||
conn.ca_certs = cert_loc
|
||||
else:
|
||||
conn.ca_cert_dir = cert_loc
|
||||
else:
|
||||
conn.cert_reqs = "CERT_NONE"
|
||||
conn.ca_certs = None
|
||||
conn.ca_cert_dir = None
|
||||
|
||||
if cert:
|
||||
if not isinstance(cert, basestring):
|
||||
conn.cert_file = cert[0]
|
||||
conn.key_file = cert[1]
|
||||
else:
|
||||
conn.cert_file = cert
|
||||
conn.key_file = None
|
||||
if conn.cert_file and not os.path.exists(conn.cert_file):
|
||||
raise OSError(
|
||||
f"Could not find the TLS certificate file, "
|
||||
f"invalid path: {conn.cert_file}"
|
||||
)
|
||||
if conn.key_file and not os.path.exists(conn.key_file):
|
||||
raise OSError(
|
||||
f"Could not find the TLS key file, invalid path: {conn.key_file}"
|
||||
)
|
||||
|
||||
def build_response(self, req, resp):
|
||||
"""Builds a :class:`Response <requests.Response>` object from a urllib3
|
||||
response. This should not be called from user code, and is only exposed
|
||||
for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`
|
||||
|
||||
:param req: The :class:`PreparedRequest <PreparedRequest>` used to generate the response.
|
||||
:param resp: The urllib3 response object.
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
response = Response()
|
||||
|
||||
# Fallback to None if there's no status_code, for whatever reason.
|
||||
response.status_code = getattr(resp, "status", None)
|
||||
|
||||
# Make headers case-insensitive.
|
||||
response.headers = CaseInsensitiveDict(getattr(resp, "headers", {}))
|
||||
|
||||
# Set encoding.
|
||||
response.encoding = get_encoding_from_headers(response.headers)
|
||||
response.raw = resp
|
||||
response.reason = response.raw.reason
|
||||
|
||||
if isinstance(req.url, bytes):
|
||||
response.url = req.url.decode("utf-8")
|
||||
else:
|
||||
response.url = req.url
|
||||
|
||||
# Add new cookies from the server.
|
||||
extract_cookies_to_jar(response.cookies, req, resp)
|
||||
|
||||
# Give the Response some context.
|
||||
response.request = req
|
||||
response.connection = self
|
||||
|
||||
return response
|
||||
|
||||
def get_connection(self, url, proxies=None):
|
||||
"""Returns a urllib3 connection for the given URL. This should not be
|
||||
called from user code, and is only exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param url: The URL to connect to.
|
||||
:param proxies: (optional) A Requests-style dictionary of proxies used on this request.
|
||||
:rtype: urllib3.ConnectionPool
|
||||
"""
|
||||
proxy = select_proxy(url, proxies)
|
||||
|
||||
if proxy:
|
||||
proxy = prepend_scheme_if_needed(proxy, "http")
|
||||
proxy_url = parse_url(proxy)
|
||||
if not proxy_url.host:
|
||||
raise InvalidProxyURL(
|
||||
"Please check proxy URL. It is malformed "
|
||||
"and could be missing the host."
|
||||
)
|
||||
proxy_manager = self.proxy_manager_for(proxy)
|
||||
conn = proxy_manager.connection_from_url(url)
|
||||
else:
|
||||
# Only scheme should be lower case
|
||||
parsed = urlparse(url)
|
||||
url = parsed.geturl()
|
||||
conn = self.poolmanager.connection_from_url(url)
|
||||
|
||||
return conn
|
||||
|
||||
def close(self):
|
||||
"""Disposes of any internal state.
|
||||
|
||||
Currently, this closes the PoolManager and any active ProxyManager,
|
||||
which closes any pooled connections.
|
||||
"""
|
||||
self.poolmanager.clear()
|
||||
for proxy in self.proxy_manager.values():
|
||||
proxy.clear()
|
||||
|
||||
def request_url(self, request, proxies):
|
||||
"""Obtain the url to use when making the final request.
|
||||
|
||||
If the message is being sent through a HTTP proxy, the full URL has to
|
||||
be used. Otherwise, we should only use the path portion of the URL.
|
||||
|
||||
This should not be called from user code, and is only exposed for use
|
||||
when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
|
||||
:param proxies: A dictionary of schemes or schemes and hosts to proxy URLs.
|
||||
:rtype: str
|
||||
"""
|
||||
proxy = select_proxy(request.url, proxies)
|
||||
scheme = urlparse(request.url).scheme
|
||||
|
||||
is_proxied_http_request = proxy and scheme != "https"
|
||||
using_socks_proxy = False
|
||||
if proxy:
|
||||
proxy_scheme = urlparse(proxy).scheme.lower()
|
||||
using_socks_proxy = proxy_scheme.startswith("socks")
|
||||
|
||||
url = request.path_url
|
||||
if is_proxied_http_request and not using_socks_proxy:
|
||||
url = urldefragauth(request.url)
|
||||
|
||||
return url
|
||||
|
||||
def add_headers(self, request, **kwargs):
|
||||
"""Add any headers needed by the connection. As of v2.0 this does
|
||||
nothing by default, but is left for overriding by users that subclass
|
||||
the :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
This should not be called from user code, and is only exposed for use
|
||||
when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param request: The :class:`PreparedRequest <PreparedRequest>` to add headers to.
|
||||
:param kwargs: The keyword arguments from the call to send().
|
||||
"""
|
||||
pass
|
||||
|
||||
def proxy_headers(self, proxy):
|
||||
"""Returns a dictionary of the headers to add to any request sent
|
||||
through a proxy. This works with urllib3 magic to ensure that they are
|
||||
correctly sent to the proxy, rather than in a tunnelled request if
|
||||
CONNECT is being used.
|
||||
|
||||
This should not be called from user code, and is only exposed for use
|
||||
when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param proxy: The url of the proxy being used for this request.
|
||||
:rtype: dict
|
||||
"""
|
||||
headers = {}
|
||||
username, password = get_auth_from_url(proxy)
|
||||
|
||||
if username:
|
||||
headers["Proxy-Authorization"] = _basic_auth_str(username, password)
|
||||
|
||||
return headers
|
||||
|
||||
def send(
|
||||
self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None
|
||||
):
|
||||
"""Sends PreparedRequest object. Returns Response object.
|
||||
|
||||
:param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
|
||||
:param stream: (optional) Whether to stream the request content.
|
||||
:param timeout: (optional) How long to wait for the server to send
|
||||
data before giving up, as a float, or a :ref:`(connect timeout,
|
||||
read timeout) <timeouts>` tuple.
|
||||
:type timeout: float or tuple or urllib3 Timeout object
|
||||
:param verify: (optional) Either a boolean, in which case it controls whether
|
||||
we verify the server's TLS certificate, or a string, in which case it
|
||||
must be a path to a CA bundle to use
|
||||
:param cert: (optional) Any user-provided SSL certificate to be trusted.
|
||||
:param proxies: (optional) The proxies dictionary to apply to the request.
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
try:
|
||||
conn = self.get_connection(request.url, proxies)
|
||||
except LocationValueError as e:
|
||||
raise InvalidURL(e, request=request)
|
||||
|
||||
self.cert_verify(conn, request.url, verify, cert)
|
||||
url = self.request_url(request, proxies)
|
||||
self.add_headers(
|
||||
request,
|
||||
stream=stream,
|
||||
timeout=timeout,
|
||||
verify=verify,
|
||||
cert=cert,
|
||||
proxies=proxies,
|
||||
)
|
||||
|
||||
chunked = not (request.body is None or "Content-Length" in request.headers)
|
||||
|
||||
if isinstance(timeout, tuple):
|
||||
try:
|
||||
connect, read = timeout
|
||||
timeout = TimeoutSauce(connect=connect, read=read)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid timeout {timeout}. Pass a (connect, read) timeout tuple, "
|
||||
f"or a single float to set both timeouts to the same value."
|
||||
)
|
||||
elif isinstance(timeout, TimeoutSauce):
|
||||
pass
|
||||
else:
|
||||
timeout = TimeoutSauce(connect=timeout, read=timeout)
|
||||
|
||||
try:
|
||||
if not chunked:
|
||||
resp = conn.urlopen(
|
||||
method=request.method,
|
||||
url=url,
|
||||
body=request.body,
|
||||
headers=request.headers,
|
||||
redirect=False,
|
||||
assert_same_host=False,
|
||||
preload_content=False,
|
||||
decode_content=False,
|
||||
retries=self.max_retries,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
# Send the request.
|
||||
else:
|
||||
if hasattr(conn, "proxy_pool"):
|
||||
conn = conn.proxy_pool
|
||||
|
||||
low_conn = conn._get_conn(timeout=DEFAULT_POOL_TIMEOUT)
|
||||
|
||||
try:
|
||||
skip_host = "Host" in request.headers
|
||||
low_conn.putrequest(
|
||||
request.method,
|
||||
url,
|
||||
skip_accept_encoding=True,
|
||||
skip_host=skip_host,
|
||||
)
|
||||
|
||||
for header, value in request.headers.items():
|
||||
low_conn.putheader(header, value)
|
||||
|
||||
low_conn.endheaders()
|
||||
|
||||
for i in request.body:
|
||||
low_conn.send(hex(len(i))[2:].encode("utf-8"))
|
||||
low_conn.send(b"\r\n")
|
||||
low_conn.send(i)
|
||||
low_conn.send(b"\r\n")
|
||||
low_conn.send(b"0\r\n\r\n")
|
||||
|
||||
# Receive the response from the server
|
||||
r = low_conn.getresponse()
|
||||
|
||||
resp = HTTPResponse.from_httplib(
|
||||
r,
|
||||
pool=conn,
|
||||
connection=low_conn,
|
||||
preload_content=False,
|
||||
decode_content=False,
|
||||
)
|
||||
except Exception:
|
||||
# If we hit any problems here, clean up the connection.
|
||||
# Then, raise so that we can handle the actual exception.
|
||||
low_conn.close()
|
||||
raise
|
||||
|
||||
except (ProtocolError, OSError) as err:
|
||||
raise ConnectionError(err, request=request)
|
||||
|
||||
except MaxRetryError as e:
|
||||
if isinstance(e.reason, ConnectTimeoutError):
|
||||
# TODO: Remove this in 3.0.0: see #2811
|
||||
if not isinstance(e.reason, NewConnectionError):
|
||||
raise ConnectTimeout(e, request=request)
|
||||
|
||||
if isinstance(e.reason, ResponseError):
|
||||
raise RetryError(e, request=request)
|
||||
|
||||
if isinstance(e.reason, _ProxyError):
|
||||
raise ProxyError(e, request=request)
|
||||
|
||||
if isinstance(e.reason, _SSLError):
|
||||
# This branch is for urllib3 v1.22 and later.
|
||||
raise SSLError(e, request=request)
|
||||
|
||||
raise ConnectionError(e, request=request)
|
||||
|
||||
except ClosedPoolError as e:
|
||||
raise ConnectionError(e, request=request)
|
||||
|
||||
except _ProxyError as e:
|
||||
raise ProxyError(e)
|
||||
|
||||
except (_SSLError, _HTTPError) as e:
|
||||
if isinstance(e, _SSLError):
|
||||
# This branch is for urllib3 versions earlier than v1.22
|
||||
raise SSLError(e, request=request)
|
||||
elif isinstance(e, ReadTimeoutError):
|
||||
raise ReadTimeout(e, request=request)
|
||||
elif isinstance(e, _InvalidHeader):
|
||||
raise InvalidHeader(e, request=request)
|
||||
else:
|
||||
raise
|
||||
|
||||
return self.build_response(request, resp)
|
||||
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
requests.api
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This module implements the Requests API.
|
||||
|
||||
:copyright: (c) 2012 by Kenneth Reitz.
|
||||
:license: Apache2, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
from . import sessions
|
||||
|
||||
|
||||
def request(method, url, **kwargs):
|
||||
"""Constructs and sends a :class:`Request <Request>`.
|
||||
|
||||
:param method: method for the new :class:`Request` object: ``GET``, ``OPTIONS``, ``HEAD``, ``POST``, ``PUT``, ``PATCH``, or ``DELETE``.
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param params: (optional) Dictionary, list of tuples or bytes to send
|
||||
in the query string for the :class:`Request`.
|
||||
:param data: (optional) Dictionary, list of tuples, bytes, or file-like
|
||||
object to send in the body of the :class:`Request`.
|
||||
:param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`.
|
||||
:param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
|
||||
:param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
|
||||
:param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
|
||||
``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
|
||||
or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
|
||||
defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
|
||||
to add for the file.
|
||||
:param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
|
||||
:param timeout: (optional) How many seconds to wait for the server to send data
|
||||
before giving up, as a float, or a :ref:`(connect timeout, read
|
||||
timeout) <timeouts>` tuple.
|
||||
:type timeout: float or tuple
|
||||
:param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
|
||||
:type allow_redirects: bool
|
||||
:param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
|
||||
:param verify: (optional) Either a boolean, in which case it controls whether we verify
|
||||
the server's TLS certificate, or a string, in which case it must be a path
|
||||
to a CA bundle to use. Defaults to ``True``.
|
||||
:param stream: (optional) if ``False``, the response content will be immediately downloaded.
|
||||
:param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
|
||||
Usage::
|
||||
|
||||
>>> import requests
|
||||
>>> req = requests.request('GET', 'https://httpbin.org/get')
|
||||
>>> req
|
||||
<Response [200]>
|
||||
"""
|
||||
|
||||
# By using the 'with' statement we are sure the session is closed, thus we
|
||||
# avoid leaving sockets open which can trigger a ResourceWarning in some
|
||||
# cases, and look like a memory leak in others.
|
||||
with sessions.Session() as session:
|
||||
return session.request(method=method, url=url, **kwargs)
|
||||
|
||||
|
||||
def get(url, params=None, **kwargs):
|
||||
r"""Sends a GET request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param params: (optional) Dictionary, list of tuples or bytes to send
|
||||
in the query string for the :class:`Request`.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("get", url, params=params, **kwargs)
|
||||
|
||||
|
||||
def options(url, **kwargs):
|
||||
r"""Sends an OPTIONS request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("options", url, **kwargs)
|
||||
|
||||
|
||||
def head(url, **kwargs):
|
||||
r"""Sends a HEAD request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes. If
|
||||
`allow_redirects` is not provided, it will be set to `False` (as
|
||||
opposed to the default :meth:`request` behavior).
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
kwargs.setdefault("allow_redirects", False)
|
||||
return request("head", url, **kwargs)
|
||||
|
||||
|
||||
def post(url, data=None, json=None, **kwargs):
|
||||
r"""Sends a POST request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param data: (optional) Dictionary, list of tuples, bytes, or file-like
|
||||
object to send in the body of the :class:`Request`.
|
||||
:param json: (optional) json data to send in the body of the :class:`Request`.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("post", url, data=data, json=json, **kwargs)
|
||||
|
||||
|
||||
def put(url, data=None, **kwargs):
|
||||
r"""Sends a PUT request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param data: (optional) Dictionary, list of tuples, bytes, or file-like
|
||||
object to send in the body of the :class:`Request`.
|
||||
:param json: (optional) json data to send in the body of the :class:`Request`.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("put", url, data=data, **kwargs)
|
||||
|
||||
|
||||
def patch(url, data=None, **kwargs):
|
||||
r"""Sends a PATCH request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param data: (optional) Dictionary, list of tuples, bytes, or file-like
|
||||
object to send in the body of the :class:`Request`.
|
||||
:param json: (optional) json data to send in the body of the :class:`Request`.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("patch", url, data=data, **kwargs)
|
||||
|
||||
|
||||
def delete(url, **kwargs):
|
||||
r"""Sends a DELETE request.
|
||||
|
||||
:param url: URL for the new :class:`Request` object.
|
||||
:param \*\*kwargs: Optional arguments that ``request`` takes.
|
||||
:return: :class:`Response <Response>` object
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
return request("delete", url, **kwargs)
|
||||
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
requests.auth
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
This module contains the authentication handlers for Requests.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
import warnings
|
||||
from base64 import b64encode
|
||||
|
||||
from ._internal_utils import to_native_string
|
||||
from .compat import basestring, str, urlparse
|
||||
from .cookies import extract_cookies_to_jar
|
||||
from .utils import parse_dict_header
|
||||
|
||||
CONTENT_TYPE_FORM_URLENCODED = "application/x-www-form-urlencoded"
|
||||
CONTENT_TYPE_MULTI_PART = "multipart/form-data"
|
||||
|
||||
|
||||
def _basic_auth_str(username, password):
|
||||
"""Returns a Basic Auth string."""
|
||||
|
||||
# "I want us to put a big-ol' comment on top of it that
|
||||
# says that this behaviour is dumb but we need to preserve
|
||||
# it because people are relying on it."
|
||||
# - Lukasa
|
||||
#
|
||||
# These are here solely to maintain backwards compatibility
|
||||
# for things like ints. This will be removed in 3.0.0.
|
||||
if not isinstance(username, basestring):
|
||||
warnings.warn(
|
||||
"Non-string usernames will no longer be supported in Requests "
|
||||
"3.0.0. Please convert the object you've passed in ({!r}) to "
|
||||
"a string or bytes object in the near future to avoid "
|
||||
"problems.".format(username),
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
username = str(username)
|
||||
|
||||
if not isinstance(password, basestring):
|
||||
warnings.warn(
|
||||
"Non-string passwords will no longer be supported in Requests "
|
||||
"3.0.0. Please convert the object you've passed in ({!r}) to "
|
||||
"a string or bytes object in the near future to avoid "
|
||||
"problems.".format(type(password)),
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
password = str(password)
|
||||
# -- End Removal --
|
||||
|
||||
if isinstance(username, str):
|
||||
username = username.encode("latin1")
|
||||
|
||||
if isinstance(password, str):
|
||||
password = password.encode("latin1")
|
||||
|
||||
authstr = "Basic " + to_native_string(
|
||||
b64encode(b":".join((username, password))).strip()
|
||||
)
|
||||
|
||||
return authstr
|
||||
|
||||
|
||||
class AuthBase:
|
||||
"""Base class that all auth implementations derive from"""
|
||||
|
||||
def __call__(self, r):
|
||||
raise NotImplementedError("Auth hooks must be callable.")
|
||||
|
||||
|
||||
class HTTPBasicAuth(AuthBase):
|
||||
"""Attaches HTTP Basic Authentication to the given Request object."""
|
||||
|
||||
def __init__(self, username, password):
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
def __eq__(self, other):
|
||||
return all(
|
||||
[
|
||||
self.username == getattr(other, "username", None),
|
||||
self.password == getattr(other, "password", None),
|
||||
]
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __call__(self, r):
|
||||
r.headers["Authorization"] = _basic_auth_str(self.username, self.password)
|
||||
return r
|
||||
|
||||
|
||||
class HTTPProxyAuth(HTTPBasicAuth):
|
||||
"""Attaches HTTP Proxy Authentication to a given Request object."""
|
||||
|
||||
def __call__(self, r):
|
||||
r.headers["Proxy-Authorization"] = _basic_auth_str(self.username, self.password)
|
||||
return r
|
||||
|
||||
|
||||
class HTTPDigestAuth(AuthBase):
|
||||
"""Attaches HTTP Digest Authentication to the given Request object."""
|
||||
|
||||
def __init__(self, username, password):
|
||||
self.username = username
|
||||
self.password = password
|
||||
# Keep state in per-thread local storage
|
||||
self._thread_local = threading.local()
|
||||
|
||||
def init_per_thread_state(self):
|
||||
# Ensure state is initialized just once per-thread
|
||||
if not hasattr(self._thread_local, "init"):
|
||||
self._thread_local.init = True
|
||||
self._thread_local.last_nonce = ""
|
||||
self._thread_local.nonce_count = 0
|
||||
self._thread_local.chal = {}
|
||||
self._thread_local.pos = None
|
||||
self._thread_local.num_401_calls = None
|
||||
|
||||
def build_digest_header(self, method, url):
|
||||
"""
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
realm = self._thread_local.chal["realm"]
|
||||
nonce = self._thread_local.chal["nonce"]
|
||||
qop = self._thread_local.chal.get("qop")
|
||||
algorithm = self._thread_local.chal.get("algorithm")
|
||||
opaque = self._thread_local.chal.get("opaque")
|
||||
hash_utf8 = None
|
||||
|
||||
if algorithm is None:
|
||||
_algorithm = "MD5"
|
||||
else:
|
||||
_algorithm = algorithm.upper()
|
||||
# lambdas assume digest modules are imported at the top level
|
||||
if _algorithm == "MD5" or _algorithm == "MD5-SESS":
|
||||
|
||||
def md5_utf8(x):
|
||||
if isinstance(x, str):
|
||||
x = x.encode("utf-8")
|
||||
return hashlib.md5(x).hexdigest()
|
||||
|
||||
hash_utf8 = md5_utf8
|
||||
elif _algorithm == "SHA":
|
||||
|
||||
def sha_utf8(x):
|
||||
if isinstance(x, str):
|
||||
x = x.encode("utf-8")
|
||||
return hashlib.sha1(x).hexdigest()
|
||||
|
||||
hash_utf8 = sha_utf8
|
||||
elif _algorithm == "SHA-256":
|
||||
|
||||
def sha256_utf8(x):
|
||||
if isinstance(x, str):
|
||||
x = x.encode("utf-8")
|
||||
return hashlib.sha256(x).hexdigest()
|
||||
|
||||
hash_utf8 = sha256_utf8
|
||||
elif _algorithm == "SHA-512":
|
||||
|
||||
def sha512_utf8(x):
|
||||
if isinstance(x, str):
|
||||
x = x.encode("utf-8")
|
||||
return hashlib.sha512(x).hexdigest()
|
||||
|
||||
hash_utf8 = sha512_utf8
|
||||
|
||||
KD = lambda s, d: hash_utf8(f"{s}:{d}") # noqa:E731
|
||||
|
||||
if hash_utf8 is None:
|
||||
return None
|
||||
|
||||
# XXX not implemented yet
|
||||
entdig = None
|
||||
p_parsed = urlparse(url)
|
||||
#: path is request-uri defined in RFC 2616 which should not be empty
|
||||
path = p_parsed.path or "/"
|
||||
if p_parsed.query:
|
||||
path += f"?{p_parsed.query}"
|
||||
|
||||
A1 = f"{self.username}:{realm}:{self.password}"
|
||||
A2 = f"{method}:{path}"
|
||||
|
||||
HA1 = hash_utf8(A1)
|
||||
HA2 = hash_utf8(A2)
|
||||
|
||||
if nonce == self._thread_local.last_nonce:
|
||||
self._thread_local.nonce_count += 1
|
||||
else:
|
||||
self._thread_local.nonce_count = 1
|
||||
ncvalue = f"{self._thread_local.nonce_count:08x}"
|
||||
s = str(self._thread_local.nonce_count).encode("utf-8")
|
||||
s += nonce.encode("utf-8")
|
||||
s += time.ctime().encode("utf-8")
|
||||
s += os.urandom(8)
|
||||
|
||||
cnonce = hashlib.sha1(s).hexdigest()[:16]
|
||||
if _algorithm == "MD5-SESS":
|
||||
HA1 = hash_utf8(f"{HA1}:{nonce}:{cnonce}")
|
||||
|
||||
if not qop:
|
||||
respdig = KD(HA1, f"{nonce}:{HA2}")
|
||||
elif qop == "auth" or "auth" in qop.split(","):
|
||||
noncebit = f"{nonce}:{ncvalue}:{cnonce}:auth:{HA2}"
|
||||
respdig = KD(HA1, noncebit)
|
||||
else:
|
||||
# XXX handle auth-int.
|
||||
return None
|
||||
|
||||
self._thread_local.last_nonce = nonce
|
||||
|
||||
# XXX should the partial digests be encoded too?
|
||||
base = (
|
||||
f'username="{self.username}", realm="{realm}", nonce="{nonce}", '
|
||||
f'uri="{path}", response="{respdig}"'
|
||||
)
|
||||
if opaque:
|
||||
base += f', opaque="{opaque}"'
|
||||
if algorithm:
|
||||
base += f', algorithm="{algorithm}"'
|
||||
if entdig:
|
||||
base += f', digest="{entdig}"'
|
||||
if qop:
|
||||
base += f', qop="auth", nc={ncvalue}, cnonce="{cnonce}"'
|
||||
|
||||
return f"Digest {base}"
|
||||
|
||||
def handle_redirect(self, r, **kwargs):
|
||||
"""Reset num_401_calls counter on redirects."""
|
||||
if r.is_redirect:
|
||||
self._thread_local.num_401_calls = 1
|
||||
|
||||
def handle_401(self, r, **kwargs):
|
||||
"""
|
||||
Takes the given response and tries digest-auth, if needed.
|
||||
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
|
||||
# If response is not 4xx, do not auth
|
||||
# See https://github.com/psf/requests/issues/3772
|
||||
if not 400 <= r.status_code < 500:
|
||||
self._thread_local.num_401_calls = 1
|
||||
return r
|
||||
|
||||
if self._thread_local.pos is not None:
|
||||
# Rewind the file position indicator of the body to where
|
||||
# it was to resend the request.
|
||||
r.request.body.seek(self._thread_local.pos)
|
||||
s_auth = r.headers.get("www-authenticate", "")
|
||||
|
||||
if "digest" in s_auth.lower() and self._thread_local.num_401_calls < 2:
|
||||
|
||||
self._thread_local.num_401_calls += 1
|
||||
pat = re.compile(r"digest ", flags=re.IGNORECASE)
|
||||
self._thread_local.chal = parse_dict_header(pat.sub("", s_auth, count=1))
|
||||
|
||||
# Consume content and release the original connection
|
||||
# to allow our new request to reuse the same one.
|
||||
r.content
|
||||
r.close()
|
||||
prep = r.request.copy()
|
||||
extract_cookies_to_jar(prep._cookies, r.request, r.raw)
|
||||
prep.prepare_cookies(prep._cookies)
|
||||
|
||||
prep.headers["Authorization"] = self.build_digest_header(
|
||||
prep.method, prep.url
|
||||
)
|
||||
_r = r.connection.send(prep, **kwargs)
|
||||
_r.history.append(r)
|
||||
_r.request = prep
|
||||
|
||||
return _r
|
||||
|
||||
self._thread_local.num_401_calls = 1
|
||||
return r
|
||||
|
||||
def __call__(self, r):
|
||||
# Initialize per-thread state, if needed
|
||||
self.init_per_thread_state()
|
||||
# If we have a saved nonce, skip the 401
|
||||
if self._thread_local.last_nonce:
|
||||
r.headers["Authorization"] = self.build_digest_header(r.method, r.url)
|
||||
try:
|
||||
self._thread_local.pos = r.body.tell()
|
||||
except AttributeError:
|
||||
# In the case of HTTPDigestAuth being reused and the body of
|
||||
# the previous request was a file-like object, pos has the
|
||||
# file position of the previous body. Ensure it's set to
|
||||
# None.
|
||||
self._thread_local.pos = None
|
||||
r.register_hook("response", self.handle_401)
|
||||
r.register_hook("response", self.handle_redirect)
|
||||
self._thread_local.num_401_calls = 1
|
||||
|
||||
return r
|
||||
|
||||
def __eq__(self, other):
|
||||
return all(
|
||||
[
|
||||
self.username == getattr(other, "username", None),
|
||||
self.password == getattr(other, "password", None),
|
||||
]
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user