Mirror of https://gitflic.ru/project/photopea-v2/photopea-v-2.git (synced 2026-04-02 22:16:21 +00:00)
Mirror "Templates" and "Plugins" community pages
This commit is contained in:
33 Updater.py
@@ -1,9 +1,12 @@
#!/bin/python3

import sys
sys.path.insert(0,"_vendor")

import requests
import os, sys
import os
import re
import json
sys.path.insert(0,"_vendor")
from tqdm import tqdm
from dataclasses import dataclass
import glob
@@ -30,13 +33,29 @@ urls = [
"code/storages/deviceStorage.html",
"code/storages/googledriveStorage.html",
"code/storages/dropboxStorage.html",
"rsrc/basic/fa_basic.csh"
"rsrc/basic/fa_basic.csh",
"img/nft.png",
["templates/?type=0&rsrc=","templates/?type=0.html"],
["templates/?type=1&rsrc=","templates/?type=1.html"],
["templates/?type=2&rsrc=","templates/?type=2.html"],
["templates/?type=3&rsrc=","templates/?type=3.html"],
"templates/templates.js",
"templates/templates.css"
]


#Update files
def dl_file(path):
    if isinstance(path,list):
        output=path[1]
        path=path[0]
    else:
        output=path
        path=path
    outfn = root + output
    if os.path.exists(outfn):
        return
    with tqdm(desc=path, unit="B", unit_scale=True) as progress_bar:
        r = requests.get(website + path, stream=True)
        progress_bar.total = int(r.headers.get("Content-Length", 0))
@@ -44,8 +63,8 @@ def dl_file(path):
        if r.status_code != 200:
            progress_bar.desc += "ERROR: HTTP Status %d" % r.status_code
            return

        outfn = root + path

        os.makedirs(os.path.dirname(outfn), exist_ok=True)
        with open(outfn, "wb") as outf:
            for chunk in r.iter_content(chunk_size=1024):
@@ -145,3 +164,7 @@ find_and_replace('code/storages/dropboxStorage.html', 'var redirectUri = window.
find_and_replace('index.html','https://connect.facebook.net','')

find_and_replace('index.html','https://www.facebook.com','')

#Redirect dynamic pages to static equivalent
find_and_replace('code/pp/pp.js','"&rsrc="','".html"')
find_and_replace('code/pp/pp.js','"templates/?type="','"templates/%3Ftype="')
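Note on the new url entries: a plain string in urls is used both as the remote path and the local output path, while the two-element entries added in this commit pair a dynamic remote query (e.g. templates/?type=0&rsrc=) with a distinct static filename to save it under. A minimal sketch of that mapping follows; the website and root values are hypothetical placeholders standing in for the globals defined elsewhere in Updater.py.

# Sketch only: illustrates how dl_file() interprets urls[] entries.
# "website" and "root" values here are hypothetical placeholders.
website = "https://example.invalid/"
root = "mirror/"

def resolve(entry):
    # list entry: [remote_path, local_output]; string entry: same path for both
    if isinstance(entry, list):
        remote, local = entry[0], entry[1]
    else:
        remote = local = entry
    return website + remote, root + local

print(resolve("templates/templates.js"))
print(resolve(["templates/?type=0&rsrc=", "templates/?type=0.html"]))
# -> ('https://example.invalid/templates/?type=0&rsrc=', 'mirror/templates/?type=0.html')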
1 _vendor/certifi-2022.9.24.dist-info/INSTALLER Normal file
@@ -0,0 +1 @@
pip
21 _vendor/certifi-2022.9.24.dist-info/LICENSE Normal file
@@ -0,0 +1,21 @@
This package contains a modified version of ca-bundle.crt:

ca-bundle.crt -- Bundle of CA Root Certificates

Certificate data from Mozilla as of: Thu Nov 3 19:04:19 2011#
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.#

***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.

***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
83 _vendor/certifi-2022.9.24.dist-info/METADATA Normal file
@@ -0,0 +1,83 @@
Metadata-Version: 2.1
Name: certifi
Version: 2022.9.24
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Requires-Python: >=3.6
License-File: LICENSE

Certifi: Python SSL Certificates
================================

Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.

Installation
------------

``certifi`` is available on PyPI. Simply install it with ``pip``::

    $ pip install certifi

Usage
-----

To reference the installed certificate authority (CA) bundle, you can use the
built-in function::

    >>> import certifi

    >>> certifi.where()
    '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'

Or from the command line::

    $ python -m certifi
    /usr/local/lib/python3.7/site-packages/certifi/cacert.pem

Enjoy!

1024-bit Root Certificates
~~~~~~~~~~~~~~~~~~~~~~~~~~

Browsers and certificate authorities have concluded that 1024-bit keys are
unacceptably weak for certificates, particularly root certificates. For this
reason, Mozilla has removed any weak (i.e. 1024-bit key) certificate from its
bundle, replacing it with an equivalent strong (i.e. 2048-bit or greater key)
certificate from the same CA. Because Mozilla removed these certificates from
its bundle, ``certifi`` removed them as well.

In previous versions, ``certifi`` provided the ``certifi.old_where()`` function
to intentionally re-add the 1024-bit roots back into your bundle. This was not
recommended in production and therefore was removed at the end of 2018.

.. _`Requests`: https://requests.readthedocs.io/en/master/

Addition/Removal of Certificates
--------------------------------

Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.
14 _vendor/certifi-2022.9.24.dist-info/RECORD Normal file
@@ -0,0 +1,14 @@
certifi-2022.9.24.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2022.9.24.dist-info/LICENSE,sha256=oC9sY4-fuE0G93ZMOrCF2K9-2luTwWbaVDEkeQd8b7A,1052
certifi-2022.9.24.dist-info/METADATA,sha256=33NAOmkqKTCb2u1Ys8Zth7ABWXfEuLgp-5gLp1yK_7A,2911
certifi-2022.9.24.dist-info/RECORD,,
certifi-2022.9.24.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
certifi-2022.9.24.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=luDjIGxDSrQ9O0zthdz5Lnt069Z_7eR1GIEefEaf-Ys,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-310.pyc,,
certifi/__pycache__/__main__.cpython-310.pyc,,
certifi/__pycache__/core.cpython-310.pyc,,
certifi/cacert.pem,sha256=3l8CcWt_qL42030rGieD3SLufICFX0bYtGhDl_EXVPI,286370
certifi/core.py,sha256=lhewz0zFb2b4ULsQurElmloYwQoecjWzPqY67P8T7iM,4219
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5 _vendor/certifi-2022.9.24.dist-info/WHEEL Normal file
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.0)
Root-Is-Purelib: true
Tag: py3-none-any
1 _vendor/certifi-2022.9.24.dist-info/top_level.txt Normal file
@@ -0,0 +1 @@
certifi
4 _vendor/certifi/__init__.py Normal file
@@ -0,0 +1,4 @@
from .core import contents, where

__all__ = ["contents", "where"]
__version__ = "2022.09.24"
12 _vendor/certifi/__main__.py Normal file
@@ -0,0 +1,12 @@
import argparse

from certifi import contents, where

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()

if args.contents:
    print(contents())
else:
    print(where())
BIN _vendor/certifi/__pycache__/__init__.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/certifi/__pycache__/__main__.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/certifi/__pycache__/core.cpython-310.pyc Normal file (binary file not shown)
4708 _vendor/certifi/cacert.pem Normal file (file diff suppressed because it is too large)
108 _vendor/certifi/core.py Normal file
@@ -0,0 +1,108 @@
"""
certifi.py
~~~~~~~~~~

This module returns the installation location of cacert.pem or its contents.
"""
import sys


if sys.version_info >= (3, 11):

    from importlib.resources import as_file, files

    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        # This is slightly terrible, but we want to delay extracting the file
        # in cases where we're inside of a zipimport situation until someone
        # actually calls where(), but we don't want to re-extract the file
        # on every call of where(), so we'll do it once then store it in a
        # global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you to
            # manage the cleanup of this file, so it doesn't actually return a
            # path, it returns a context manager that will give you the path
            # when you enter it and will do any cleanup when you leave it. In
            # the common case of not needing a temporary file, it will just
            # return the file system location and the __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
            _CACERT_PATH = str(_CACERT_CTX.__enter__())

        return _CACERT_PATH

    def contents() -> str:
        return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")

elif sys.version_info >= (3, 7):

    from importlib.resources import path as get_path, read_text

    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        # This is slightly terrible, but we want to delay extracting the
        # file in cases where we're inside of a zipimport situation until
        # someone actually calls where(), but we don't want to re-extract
        # the file on every call of where(), so we'll do it once then store
        # it in a global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you
            # to manage the cleanup of this file, so it doesn't actually
            # return a path, it returns a context manager that will give
            # you the path when you enter it and will do any cleanup when
            # you leave it. In the common case of not needing a temporary
            # file, it will just return the file system location and the
            # __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = get_path("certifi", "cacert.pem")
            _CACERT_PATH = str(_CACERT_CTX.__enter__())

        return _CACERT_PATH

    def contents() -> str:
        return read_text("certifi", "cacert.pem", encoding="ascii")

else:
    import os
    import types
    from typing import Union

    Package = Union[types.ModuleType, str]
    Resource = Union[str, "os.PathLike"]

    # This fallback will work for Python versions prior to 3.7 that lack the
    # importlib.resources module but relies on the existing `where` function
    # so won't address issues with environments like PyOxidizer that don't set
    # __file__ on modules.
    def read_text(
        package: Package,
        resource: Resource,
        encoding: str = 'utf-8',
        errors: str = 'strict'
    ) -> str:
        with open(where(), encoding=encoding) as data:
            return data.read()

    # If we don't have importlib.resources, then we will just do the old logic
    # of assuming we're on the filesystem and munge the path directly.
    def where() -> str:
        f = os.path.dirname(__file__)

        return os.path.join(f, "cacert.pem")

    def contents() -> str:
        return read_text("certifi", "cacert.pem", encoding="ascii")
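The vendored certifi package above is what the bundled requests dependency uses to verify TLS once Updater.py prepends _vendor to sys.path. A short illustrative example of reading the bundle directly (requests picks it up automatically; the explicit ssl call below is only a demonstration):

# Illustration only: points an SSL context at the vendored certifi CA bundle.
import ssl
import sys

sys.path.insert(0, "_vendor")  # same path setup Updater.py performs
import certifi

print(certifi.where())  # filesystem path of the bundled cacert.pem
ctx = ssl.create_default_context(cafile=certifi.where())  # context trusting that bundle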
0 _vendor/certifi/py.typed Normal file
1 _vendor/charset_normalizer-2.1.1.dist-info/INSTALLER Normal file
@@ -0,0 +1 @@
pip
21 _vendor/charset_normalizer-2.1.1.dist-info/LICENSE Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 TAHRI Ahmed R.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
269 _vendor/charset_normalizer-2.1.1.dist-info/METADATA Normal file
@@ -0,0 +1,269 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: charset-normalizer
|
||||
Version: 2.1.1
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Home-page: https://github.com/ousret/charset_normalizer
|
||||
Author: Ahmed TAHRI @Ousret
|
||||
Author-email: ahmed.tahri@cloudnursery.dev
|
||||
License: MIT
|
||||
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
||||
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.6.0
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode_backport
|
||||
Requires-Dist: unicodedata2 ; extra == 'unicode_backport'
|
||||
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋 <a href="https://twitter.com/intent/tweet?text=The%20Real%20First%20Universal%20Charset%20%26%20Language%20Detector&url=https://www.github.com/Ousret/charset_normalizer&hashtags=python,encoding,chardet,developers"><img src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social"/></a></h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
|
||||
<img src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| `Fast` | ❌<br> | ✅<br> | ✅ <br> |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
||||
Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
||||
|
||||
## ⭐ Your support
|
||||
|
||||
*Fork, test-it, star-it, submit your ideas! We do listen.*
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
||||
| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
||||
| charset-normalizer | 400 ms | 200 ms | 15 ms |
|
||||
|
||||
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
||||
> (eg. Supported Encoding) Challenge-them if you want.
|
||||
|
||||
[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
|
||||
a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using PyPi for latest stable
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
If you want a more up-to-date `unicodedata` than the one available in your Python setup.
|
||||
```sh
|
||||
pip install charset-normalizer[unicode_backport] -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
:tada: Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Normalize any text file*
|
||||
```python
|
||||
from charset_normalizer import normalize
|
||||
try:
|
||||
normalize('./my_subtitle.srt') # should write to disk my_subtitle-***.srt
|
||||
except IOError as e:
|
||||
print('Sadly, we are unable to perform charset normalization.', str(e))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered string.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding table that could not fit the binary content.
|
||||
- Measure chaos, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is chaos/mess and coherence according to **YOU ?**
|
||||
|
||||
*Chaos :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
||||
I know that my interpretation of what is chaotic is very subjective, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © 2019 [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
33 _vendor/charset_normalizer-2.1.1.dist-info/RECORD Normal file
@@ -0,0 +1,33 @@
|
||||
../../bin/normalizer,sha256=qSpvGsyLwjZW3uUUIySX3JlQOmSuDLIwdxOnC53iQiU,243
|
||||
charset_normalizer-2.1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-2.1.1.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
|
||||
charset_normalizer-2.1.1.dist-info/METADATA,sha256=C99l12g4d1E9_UiW-mqPCWx7v2M_lYGWxy1GTOjXSsA,11942
|
||||
charset_normalizer-2.1.1.dist-info/RECORD,,
|
||||
charset_normalizer-2.1.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
charset_normalizer-2.1.1.dist-info/entry_points.txt,sha256=uYo8aIGLWv8YgWfSna5HnfY_En4pkF1w4bgawNAXzP0,76
|
||||
charset_normalizer-2.1.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=jGhhf1IcOgCpZsr593E9fPvjWKnflVqHe_LwkOJjInU,1790
|
||||
charset_normalizer/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-310.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-310.pyc,,
|
||||
charset_normalizer/api.py,sha256=euVPmjAMbjpqhEHPjfKtyy1mK52U0TOUBUQgM_Qy6eE,19191
|
||||
charset_normalizer/assets/__init__.py,sha256=r7aakPaRIc2FFG2mw2V8NOTvkl25_euKZ3wPf5SAVa4,15222
|
||||
charset_normalizer/assets/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/cd.py,sha256=Pxdkbn4cy0iZF42KTb1FiWIqqKobuz_fDjGwc6JMNBc,10811
|
||||
charset_normalizer/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc,,
|
||||
charset_normalizer/cli/__pycache__/normalizer.cpython-310.pyc,,
|
||||
charset_normalizer/cli/normalizer.py,sha256=FmD1RXeMpRBg_mjR0MaJhNUpM2qZ8wz2neAE7AayBeg,9521
|
||||
charset_normalizer/constant.py,sha256=NgU-pY8JH2a9lkVT8oKwAFmIUYNKOuSBwZgF9MrlNCM,19157
|
||||
charset_normalizer/legacy.py,sha256=XKeZOts_HdYQU_Jb3C9ZfOjY2CiUL132k9_nXer8gig,3384
|
||||
charset_normalizer/md.py,sha256=pZP8IVpSC82D8INA9Tf_y0ijJSRI-UIncZvLdfTWEd4,17642
|
||||
charset_normalizer/models.py,sha256=i68YdlSLTEI3EEBVXq8TLNAbyyjrLC2OWszc-OBAk9I,13167
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=ykOznhcAeH-ODLBWJuI7t1nbwa1SAfN_bDYTCJGyh4U,11771
|
||||
charset_normalizer/version.py,sha256=_eh2MA3qS__IajlePQxKBmlw6zaBDvPYlLdEgxgIojw,79
|
||||
5 _vendor/charset_normalizer-2.1.1.dist-info/WHEEL Normal file
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any
2 _vendor/charset_normalizer-2.1.1.dist-info/entry_points.txt Normal file
@@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli.normalizer:cli_detect
1 _vendor/charset_normalizer-2.1.1.dist-info/top_level.txt Normal file
@@ -0,0 +1 @@
charset_normalizer
56 _vendor/charset_normalizer/__init__.py Normal file
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Others methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .legacy import (
|
||||
CharsetDetector,
|
||||
CharsetDoctor,
|
||||
CharsetNormalizerMatch,
|
||||
CharsetNormalizerMatches,
|
||||
detect,
|
||||
)
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"normalize",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"CharsetNormalizerMatch",
|
||||
"CharsetNormalizerMatches",
|
||||
"CharsetDetector",
|
||||
"CharsetDoctor",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
||||
BIN _vendor/charset_normalizer/__pycache__/__init__.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/api.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/cd.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/constant.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/legacy.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/md.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/models.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/utils.cpython-310.pyc Normal file (binary file not shown)
BIN _vendor/charset_normalizer/__pycache__/version.cpython-310.pyc Normal file (binary file not shown)
584 _vendor/charset_normalizer/api.py Normal file
@@ -0,0 +1,584 @@
|
||||
import logging
|
||||
import warnings
|
||||
from os import PathLike
|
||||
from os.path import basename, splitext
|
||||
from typing import Any, BinaryIO, List, Optional, Set
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
# Will most likely be controversial
|
||||
# logging.addLevelName(TRACE, "TRACE")
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
||||
If there is no results, it is a strong indicator that the source is binary/not text.
|
||||
By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
|
||||
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
||||
but never take it for granted. Can improve the performance.
|
||||
|
||||
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {0}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level or logging.WARNING)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: List[str] = []
|
||||
|
||||
specified_encoding: Optional[str] = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: Set[str] = set()
|
||||
tested_but_hard_failure: List[str] = []
|
||||
tested_but_soft_failure: List[str] = []
|
||||
|
||||
fallback_ascii: Optional[CharsetMatch] = None
|
||||
fallback_u8: Optional[CharsetMatch] = None
|
||||
fallback_specified: Optional[CharsetMatch] = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
||||
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: Optional[str] = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)],
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :],
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
similar_soft_failure_test = True
|
||||
break
|
||||
|
||||
if similar_soft_failure_test:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
||||
encoding_iana,
|
||||
encoding_soft_failed,
|
||||
)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: List[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
encoding_iana in ["ascii", "utf_8", specified_encoding]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences, encoding_iana, threshold, False, [], decoded_payload
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# We shall skip the CD when its about ASCII
|
||||
# Most of the time its not relevant to run "language-detection" on it.
|
||||
if encoding_iana != "ascii":
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk, 0.1, ",".join(target_languages) if target_languages else None
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
results.append(
|
||||
CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
decoded_payload,
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.", encoding_iana
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but using a file pointer that is already ready.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
)
|
||||
|
||||
|
||||
def normalize(
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
) -> CharsetMatch:
|
||||
"""
|
||||
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
|
||||
"""
|
||||
warnings.warn(
|
||||
"normalize is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
results = from_path(
|
||||
path,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
)
|
||||
|
||||
filename = basename(path)
|
||||
target_extensions = list(splitext(filename))
|
||||
|
||||
if len(results) == 0:
|
||||
raise IOError(
|
||||
'Unable to normalize "{}", no encoding charset seems to fit.'.format(
|
||||
filename
|
||||
)
|
||||
)
|
||||
|
||||
result = results.best()
|
||||
|
||||
target_extensions[0] += "-" + result.encoding # type: ignore
|
||||
|
||||
with open(
|
||||
"{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
|
||||
) as fp:
|
||||
fp.write(result.output()) # type: ignore
|
||||
|
||||
return result # type: ignore
|
||||
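As a quick orientation for the from_bytes() entry point shown above, here is a hedged usage sketch; the sample text is taken from the package docstring and the parameter values are simply the documented defaults, not anything used by this repository.

# Sketch based on the from_bytes() signature above; sample bytes are illustrative.
from charset_normalizer import from_bytes

payload = "Bсеки човек има право на образование.".encode("cp1251")
matches = from_bytes(payload, steps=5, chunk_size=512, threshold=0.2, explain=False)
best = matches.best()
if best is not None:
    print(best.encoding)  # detected code page
    print(str(best))      # payload decoded using that code page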
1122 _vendor/charset_normalizer/assets/__init__.py Normal file (file diff suppressed because it is too large)
BIN _vendor/charset_normalizer/assets/__pycache__/__init__.cpython-310.pyc Normal file (binary file not shown)
339 _vendor/charset_normalizer/cd.py Normal file
@@ -0,0 +1,339 @@
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Return associated unicode ranges in a single byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise IOError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module(
|
||||
"encodings.{}".format(iana_name)
|
||||
).IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: Dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: Optional[str] = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
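# Illustrative note (not part of the upstream sources): only ranges covering at least
# ~15% of the decodable characters are reported. For instance, encoding_unicode_range("cp1251")
# would be expected to report mostly Cyrillic-related ranges; the exact output depends on
# the Unicode data available at runtime.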
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> List[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: List[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function performs that correspondence.
|
||||
"""
|
||||
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: Optional[str] = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> List[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function performs that correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese", "Classical Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> Tuple[bool, bool]:
|
||||
"""
|
||||
Determine the main aspects of a supported language: whether it contains accents and whether it is purely Latin-based.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: List[str], ignore_non_latin: bool = False
|
||||
) -> List[str]:
|
||||
"""
|
||||
Return the languages associated with the given characters.
|
||||
"""
|
||||
languages: List[Tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: List[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine whether an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that this function is not strict on the match, in order to ease the detection. (Meaning a close match counts as 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError("{} not available".format(language))
|
||||
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
for character in ordered_characters:
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
characters_before_source: List[str] = FREQUENCIES[language][
|
||||
0 : FREQUENCIES[language].index(character)
|
||||
]
|
||||
characters_after_source: List[str] = FREQUENCIES[language][
|
||||
FREQUENCIES[language].index(character) :
|
||||
]
|
||||
characters_before: List[str] = ordered_characters[
|
||||
0 : ordered_characters.index(character)
|
||||
]
|
||||
characters_after: List[str] = ordered_characters[
|
||||
ordered_characters.index(character) :
|
||||
]
|
||||
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if len(characters_after_source) == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
before_match_count / len(characters_before_source) >= 0.4
|
||||
or after_match_count / len(characters_after_source) >= 0.4
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str separated by Unicode range / alphabet.
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
one containing the Latin letters and the other the Hebrew ones.
|
||||
"""
|
||||
layers: Dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range: Optional[str] = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = character.lower()
|
||||
continue
|
||||
|
||||
layers[layer_target_range] += character.lower()
|
||||
|
||||
return list(layers.values())
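# Illustrative sketch (not part of the upstream sources), matching the docstring above:
#
#     alpha_unicode_split("Hello שלום")
#     # -> two layers, e.g. ["hello", "שלום"] (lower-cased, non-alphabetic characters dropped)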
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merges results previously produced by the function coherence_ratio.
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: Dict[str, List[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in the given sequence. The sequence is analysed by layers.
A layer = character extraction by alphabet/range.
|
||||
"""
|
||||
|
||||
results: List[Tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(results, key=lambda x: x[1], reverse=True)
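# Illustrative sketch (not part of the upstream sources): coherence_ratio() returns
# (language, ratio) pairs sorted by ratio. Calling it on a reasonably long French
# sentence would be expected to rank "French" near the top; exact ratios depend on the
# bundled frequency tables.
#
#     coherence_ratio("Le français est une langue de la famille des langues romanes.")
#     # -> [("French", 0.xxxx), ...]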
|
||||
0
_vendor/charset_normalizer/cli/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
295
_vendor/charset_normalizer/cli/normalizer.py
Normal file
@@ -0,0 +1,295 @@
|
||||
import argparse
|
||||
import sys
|
||||
from json import dumps
|
||||
from os.path import abspath
|
||||
from platform import python_version
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from unicodedata2 import unidata_version
|
||||
except ImportError:
|
||||
from unicodedata import unidata_version
|
||||
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||
"""Ask a yes/no question via input() and return their answer.
|
||||
|
||||
"question" is a string that is presented to the user.
|
||||
"default" is the presumed answer if the user just hits <Enter>.
|
||||
It must be "yes" (the default), "no" or None (meaning
|
||||
an answer is required of the user).
|
||||
|
||||
The "answer" return value is True for "yes" or False for "no".
|
||||
|
||||
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
||||
"""
|
||||
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
||||
if default is None:
|
||||
prompt = " [y/n] "
|
||||
elif default == "yes":
|
||||
prompt = " [Y/n] "
|
||||
elif default == "no":
|
||||
prompt = " [y/N] "
|
||||
else:
|
||||
raise ValueError("invalid default answer: '%s'" % default)
|
||||
|
||||
while True:
|
||||
sys.stdout.write(question + prompt)
|
||||
choice = input().lower()
|
||||
if default is not None and choice == "":
|
||||
return valid[default]
|
||||
elif choice in valid:
|
||||
return valid[choice]
|
||||
else:
|
||||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else equal trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {}".format(
|
||||
__version__, python_version(), unidata_version
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
|
||||
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else "",
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
o_: List[str] = my_file.name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
|
||||
|
||||
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
|
||||
fp.write(str(best_guess))
|
||||
except IOError as e:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
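# Illustrative sketch (not part of the upstream sources): cli_detect() also accepts an
# explicit argv list, which makes it easy to drive programmatically or from tests.
# "sample.txt" is a hypothetical file name.
#
#     from charset_normalizer.cli.normalizer import cli_detect
#
#     exit_code = cli_detect(["sample.txt", "--minimal"])   # prints only the detected charset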
|
||||
497
_vendor/charset_normalizer/constant.py
Normal file
@@ -0,0 +1,497 @@
|
||||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||
from encodings.aliases import aliases
|
||||
from re import IGNORECASE, compile as re_compile
|
||||
from typing import Dict, List, Set, Union
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
|
||||
# Contains, for each eligible encoding, its signature/BOM bytes (a single item or a list)
|
||||
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
||||
"utf_8": BOM_UTF8,
|
||||
"utf_7": [
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
"gb18030": b"\x84\x31\x95\x33",
|
||||
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
||||
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
||||
}
|
||||
|
||||
TOO_SMALL_SEQUENCE: int = 32
|
||||
TOO_BIG_SEQUENCE: int = int(10e6)
|
||||
|
||||
UTF8_MAXIMAL_ALLOCATION: int = 1112064
|
||||
|
||||
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
||||
"Control character": range(31 + 1),
|
||||
"Basic Latin": range(32, 127 + 1),
|
||||
"Latin-1 Supplement": range(128, 255 + 1),
|
||||
"Latin Extended-A": range(256, 383 + 1),
|
||||
"Latin Extended-B": range(384, 591 + 1),
|
||||
"IPA Extensions": range(592, 687 + 1),
|
||||
"Spacing Modifier Letters": range(688, 767 + 1),
|
||||
"Combining Diacritical Marks": range(768, 879 + 1),
|
||||
"Greek and Coptic": range(880, 1023 + 1),
|
||||
"Cyrillic": range(1024, 1279 + 1),
|
||||
"Cyrillic Supplement": range(1280, 1327 + 1),
|
||||
"Armenian": range(1328, 1423 + 1),
|
||||
"Hebrew": range(1424, 1535 + 1),
|
||||
"Arabic": range(1536, 1791 + 1),
|
||||
"Syriac": range(1792, 1871 + 1),
|
||||
"Arabic Supplement": range(1872, 1919 + 1),
|
||||
"Thaana": range(1920, 1983 + 1),
|
||||
"NKo": range(1984, 2047 + 1),
|
||||
"Samaritan": range(2048, 2111 + 1),
|
||||
"Mandaic": range(2112, 2143 + 1),
|
||||
"Syriac Supplement": range(2144, 2159 + 1),
|
||||
"Arabic Extended-A": range(2208, 2303 + 1),
|
||||
"Devanagari": range(2304, 2431 + 1),
|
||||
"Bengali": range(2432, 2559 + 1),
|
||||
"Gurmukhi": range(2560, 2687 + 1),
|
||||
"Gujarati": range(2688, 2815 + 1),
|
||||
"Oriya": range(2816, 2943 + 1),
|
||||
"Tamil": range(2944, 3071 + 1),
|
||||
"Telugu": range(3072, 3199 + 1),
|
||||
"Kannada": range(3200, 3327 + 1),
|
||||
"Malayalam": range(3328, 3455 + 1),
|
||||
"Sinhala": range(3456, 3583 + 1),
|
||||
"Thai": range(3584, 3711 + 1),
|
||||
"Lao": range(3712, 3839 + 1),
|
||||
"Tibetan": range(3840, 4095 + 1),
|
||||
"Myanmar": range(4096, 4255 + 1),
|
||||
"Georgian": range(4256, 4351 + 1),
|
||||
"Hangul Jamo": range(4352, 4607 + 1),
|
||||
"Ethiopic": range(4608, 4991 + 1),
|
||||
"Ethiopic Supplement": range(4992, 5023 + 1),
|
||||
"Cherokee": range(5024, 5119 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1),
|
||||
"Ogham": range(5760, 5791 + 1),
|
||||
"Runic": range(5792, 5887 + 1),
|
||||
"Tagalog": range(5888, 5919 + 1),
|
||||
"Hanunoo": range(5920, 5951 + 1),
|
||||
"Buhid": range(5952, 5983 + 1),
|
||||
"Tagbanwa": range(5984, 6015 + 1),
|
||||
"Khmer": range(6016, 6143 + 1),
|
||||
"Mongolian": range(6144, 6319 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1),
|
||||
"Limbu": range(6400, 6479 + 1),
|
||||
"Tai Le": range(6480, 6527 + 1),
|
||||
"New Tai Lue": range(6528, 6623 + 1),
|
||||
"Khmer Symbols": range(6624, 6655 + 1),
|
||||
"Buginese": range(6656, 6687 + 1),
|
||||
"Tai Tham": range(6688, 6831 + 1),
|
||||
"Combining Diacritical Marks Extended": range(6832, 6911 + 1),
|
||||
"Balinese": range(6912, 7039 + 1),
|
||||
"Sundanese": range(7040, 7103 + 1),
|
||||
"Batak": range(7104, 7167 + 1),
|
||||
"Lepcha": range(7168, 7247 + 1),
|
||||
"Ol Chiki": range(7248, 7295 + 1),
|
||||
"Cyrillic Extended C": range(7296, 7311 + 1),
|
||||
"Sundanese Supplement": range(7360, 7375 + 1),
|
||||
"Vedic Extensions": range(7376, 7423 + 1),
|
||||
"Phonetic Extensions": range(7424, 7551 + 1),
|
||||
"Phonetic Extensions Supplement": range(7552, 7615 + 1),
|
||||
"Combining Diacritical Marks Supplement": range(7616, 7679 + 1),
|
||||
"Latin Extended Additional": range(7680, 7935 + 1),
|
||||
"Greek Extended": range(7936, 8191 + 1),
|
||||
"General Punctuation": range(8192, 8303 + 1),
|
||||
"Superscripts and Subscripts": range(8304, 8351 + 1),
|
||||
"Currency Symbols": range(8352, 8399 + 1),
|
||||
"Combining Diacritical Marks for Symbols": range(8400, 8447 + 1),
|
||||
"Letterlike Symbols": range(8448, 8527 + 1),
|
||||
"Number Forms": range(8528, 8591 + 1),
|
||||
"Arrows": range(8592, 8703 + 1),
|
||||
"Mathematical Operators": range(8704, 8959 + 1),
|
||||
"Miscellaneous Technical": range(8960, 9215 + 1),
|
||||
"Control Pictures": range(9216, 9279 + 1),
|
||||
"Optical Character Recognition": range(9280, 9311 + 1),
|
||||
"Enclosed Alphanumerics": range(9312, 9471 + 1),
|
||||
"Box Drawing": range(9472, 9599 + 1),
|
||||
"Block Elements": range(9600, 9631 + 1),
|
||||
"Geometric Shapes": range(9632, 9727 + 1),
|
||||
"Miscellaneous Symbols": range(9728, 9983 + 1),
|
||||
"Dingbats": range(9984, 10175 + 1),
|
||||
"Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1),
|
||||
"Supplemental Arrows-A": range(10224, 10239 + 1),
|
||||
"Braille Patterns": range(10240, 10495 + 1),
|
||||
"Supplemental Arrows-B": range(10496, 10623 + 1),
|
||||
"Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1),
|
||||
"Supplemental Mathematical Operators": range(10752, 11007 + 1),
|
||||
"Miscellaneous Symbols and Arrows": range(11008, 11263 + 1),
|
||||
"Glagolitic": range(11264, 11359 + 1),
|
||||
"Latin Extended-C": range(11360, 11391 + 1),
|
||||
"Coptic": range(11392, 11519 + 1),
|
||||
"Georgian Supplement": range(11520, 11567 + 1),
|
||||
"Tifinagh": range(11568, 11647 + 1),
|
||||
"Ethiopic Extended": range(11648, 11743 + 1),
|
||||
"Cyrillic Extended-A": range(11744, 11775 + 1),
|
||||
"Supplemental Punctuation": range(11776, 11903 + 1),
|
||||
"CJK Radicals Supplement": range(11904, 12031 + 1),
|
||||
"Kangxi Radicals": range(12032, 12255 + 1),
|
||||
"Ideographic Description Characters": range(12272, 12287 + 1),
|
||||
"CJK Symbols and Punctuation": range(12288, 12351 + 1),
|
||||
"Hiragana": range(12352, 12447 + 1),
|
||||
"Katakana": range(12448, 12543 + 1),
|
||||
"Bopomofo": range(12544, 12591 + 1),
|
||||
"Hangul Compatibility Jamo": range(12592, 12687 + 1),
|
||||
"Kanbun": range(12688, 12703 + 1),
|
||||
"Bopomofo Extended": range(12704, 12735 + 1),
|
||||
"CJK Strokes": range(12736, 12783 + 1),
|
||||
"Katakana Phonetic Extensions": range(12784, 12799 + 1),
|
||||
"Enclosed CJK Letters and Months": range(12800, 13055 + 1),
|
||||
"CJK Compatibility": range(13056, 13311 + 1),
|
||||
"CJK Unified Ideographs Extension A": range(13312, 19903 + 1),
|
||||
"Yijing Hexagram Symbols": range(19904, 19967 + 1),
|
||||
"CJK Unified Ideographs": range(19968, 40959 + 1),
|
||||
"Yi Syllables": range(40960, 42127 + 1),
|
||||
"Yi Radicals": range(42128, 42191 + 1),
|
||||
"Lisu": range(42192, 42239 + 1),
|
||||
"Vai": range(42240, 42559 + 1),
|
||||
"Cyrillic Extended-B": range(42560, 42655 + 1),
|
||||
"Bamum": range(42656, 42751 + 1),
|
||||
"Modifier Tone Letters": range(42752, 42783 + 1),
|
||||
"Latin Extended-D": range(42784, 43007 + 1),
|
||||
"Syloti Nagri": range(43008, 43055 + 1),
|
||||
"Common Indic Number Forms": range(43056, 43071 + 1),
|
||||
"Phags-pa": range(43072, 43135 + 1),
|
||||
"Saurashtra": range(43136, 43231 + 1),
|
||||
"Devanagari Extended": range(43232, 43263 + 1),
|
||||
"Kayah Li": range(43264, 43311 + 1),
|
||||
"Rejang": range(43312, 43359 + 1),
|
||||
"Hangul Jamo Extended-A": range(43360, 43391 + 1),
|
||||
"Javanese": range(43392, 43487 + 1),
|
||||
"Myanmar Extended-B": range(43488, 43519 + 1),
|
||||
"Cham": range(43520, 43615 + 1),
|
||||
"Myanmar Extended-A": range(43616, 43647 + 1),
|
||||
"Tai Viet": range(43648, 43743 + 1),
|
||||
"Meetei Mayek Extensions": range(43744, 43775 + 1),
|
||||
"Ethiopic Extended-A": range(43776, 43823 + 1),
|
||||
"Latin Extended-E": range(43824, 43887 + 1),
|
||||
"Cherokee Supplement": range(43888, 43967 + 1),
|
||||
"Meetei Mayek": range(43968, 44031 + 1),
|
||||
"Hangul Syllables": range(44032, 55215 + 1),
|
||||
"Hangul Jamo Extended-B": range(55216, 55295 + 1),
|
||||
"High Surrogates": range(55296, 56191 + 1),
|
||||
"High Private Use Surrogates": range(56192, 56319 + 1),
|
||||
"Low Surrogates": range(56320, 57343 + 1),
|
||||
"Private Use Area": range(57344, 63743 + 1),
|
||||
"CJK Compatibility Ideographs": range(63744, 64255 + 1),
|
||||
"Alphabetic Presentation Forms": range(64256, 64335 + 1),
|
||||
"Arabic Presentation Forms-A": range(64336, 65023 + 1),
|
||||
"Variation Selectors": range(65024, 65039 + 1),
|
||||
"Vertical Forms": range(65040, 65055 + 1),
|
||||
"Combining Half Marks": range(65056, 65071 + 1),
|
||||
"CJK Compatibility Forms": range(65072, 65103 + 1),
|
||||
"Small Form Variants": range(65104, 65135 + 1),
|
||||
"Arabic Presentation Forms-B": range(65136, 65279 + 1),
|
||||
"Halfwidth and Fullwidth Forms": range(65280, 65519 + 1),
|
||||
"Specials": range(65520, 65535 + 1),
|
||||
"Linear B Syllabary": range(65536, 65663 + 1),
|
||||
"Linear B Ideograms": range(65664, 65791 + 1),
|
||||
"Aegean Numbers": range(65792, 65855 + 1),
|
||||
"Ancient Greek Numbers": range(65856, 65935 + 1),
|
||||
"Ancient Symbols": range(65936, 65999 + 1),
|
||||
"Phaistos Disc": range(66000, 66047 + 1),
|
||||
"Lycian": range(66176, 66207 + 1),
|
||||
"Carian": range(66208, 66271 + 1),
|
||||
"Coptic Epact Numbers": range(66272, 66303 + 1),
|
||||
"Old Italic": range(66304, 66351 + 1),
|
||||
"Gothic": range(66352, 66383 + 1),
|
||||
"Old Permic": range(66384, 66431 + 1),
|
||||
"Ugaritic": range(66432, 66463 + 1),
|
||||
"Old Persian": range(66464, 66527 + 1),
|
||||
"Deseret": range(66560, 66639 + 1),
|
||||
"Shavian": range(66640, 66687 + 1),
|
||||
"Osmanya": range(66688, 66735 + 1),
|
||||
"Osage": range(66736, 66815 + 1),
|
||||
"Elbasan": range(66816, 66863 + 1),
|
||||
"Caucasian Albanian": range(66864, 66927 + 1),
|
||||
"Linear A": range(67072, 67455 + 1),
|
||||
"Cypriot Syllabary": range(67584, 67647 + 1),
|
||||
"Imperial Aramaic": range(67648, 67679 + 1),
|
||||
"Palmyrene": range(67680, 67711 + 1),
|
||||
"Nabataean": range(67712, 67759 + 1),
|
||||
"Hatran": range(67808, 67839 + 1),
|
||||
"Phoenician": range(67840, 67871 + 1),
|
||||
"Lydian": range(67872, 67903 + 1),
|
||||
"Meroitic Hieroglyphs": range(67968, 67999 + 1),
|
||||
"Meroitic Cursive": range(68000, 68095 + 1),
|
||||
"Kharoshthi": range(68096, 68191 + 1),
|
||||
"Old South Arabian": range(68192, 68223 + 1),
|
||||
"Old North Arabian": range(68224, 68255 + 1),
|
||||
"Manichaean": range(68288, 68351 + 1),
|
||||
"Avestan": range(68352, 68415 + 1),
|
||||
"Inscriptional Parthian": range(68416, 68447 + 1),
|
||||
"Inscriptional Pahlavi": range(68448, 68479 + 1),
|
||||
"Psalter Pahlavi": range(68480, 68527 + 1),
|
||||
"Old Turkic": range(68608, 68687 + 1),
|
||||
"Old Hungarian": range(68736, 68863 + 1),
|
||||
"Rumi Numeral Symbols": range(69216, 69247 + 1),
|
||||
"Brahmi": range(69632, 69759 + 1),
|
||||
"Kaithi": range(69760, 69839 + 1),
|
||||
"Sora Sompeng": range(69840, 69887 + 1),
|
||||
"Chakma": range(69888, 69967 + 1),
|
||||
"Mahajani": range(69968, 70015 + 1),
|
||||
"Sharada": range(70016, 70111 + 1),
|
||||
"Sinhala Archaic Numbers": range(70112, 70143 + 1),
|
||||
"Khojki": range(70144, 70223 + 1),
|
||||
"Multani": range(70272, 70319 + 1),
|
||||
"Khudawadi": range(70320, 70399 + 1),
|
||||
"Grantha": range(70400, 70527 + 1),
|
||||
"Newa": range(70656, 70783 + 1),
|
||||
"Tirhuta": range(70784, 70879 + 1),
|
||||
"Siddham": range(71040, 71167 + 1),
|
||||
"Modi": range(71168, 71263 + 1),
|
||||
"Mongolian Supplement": range(71264, 71295 + 1),
|
||||
"Takri": range(71296, 71375 + 1),
|
||||
"Ahom": range(71424, 71487 + 1),
|
||||
"Warang Citi": range(71840, 71935 + 1),
|
||||
"Zanabazar Square": range(72192, 72271 + 1),
|
||||
"Soyombo": range(72272, 72367 + 1),
|
||||
"Pau Cin Hau": range(72384, 72447 + 1),
|
||||
"Bhaiksuki": range(72704, 72815 + 1),
|
||||
"Marchen": range(72816, 72895 + 1),
|
||||
"Masaram Gondi": range(72960, 73055 + 1),
|
||||
"Cuneiform": range(73728, 74751 + 1),
|
||||
"Cuneiform Numbers and Punctuation": range(74752, 74879 + 1),
|
||||
"Early Dynastic Cuneiform": range(74880, 75087 + 1),
|
||||
"Egyptian Hieroglyphs": range(77824, 78895 + 1),
|
||||
"Anatolian Hieroglyphs": range(82944, 83583 + 1),
|
||||
"Bamum Supplement": range(92160, 92735 + 1),
|
||||
"Mro": range(92736, 92783 + 1),
|
||||
"Bassa Vah": range(92880, 92927 + 1),
|
||||
"Pahawh Hmong": range(92928, 93071 + 1),
|
||||
"Miao": range(93952, 94111 + 1),
|
||||
"Ideographic Symbols and Punctuation": range(94176, 94207 + 1),
|
||||
"Tangut": range(94208, 100351 + 1),
|
||||
"Tangut Components": range(100352, 101119 + 1),
|
||||
"Kana Supplement": range(110592, 110847 + 1),
|
||||
"Kana Extended-A": range(110848, 110895 + 1),
|
||||
"Nushu": range(110960, 111359 + 1),
|
||||
"Duployan": range(113664, 113823 + 1),
|
||||
"Shorthand Format Controls": range(113824, 113839 + 1),
|
||||
"Byzantine Musical Symbols": range(118784, 119039 + 1),
|
||||
"Musical Symbols": range(119040, 119295 + 1),
|
||||
"Ancient Greek Musical Notation": range(119296, 119375 + 1),
|
||||
"Tai Xuan Jing Symbols": range(119552, 119647 + 1),
|
||||
"Counting Rod Numerals": range(119648, 119679 + 1),
|
||||
"Mathematical Alphanumeric Symbols": range(119808, 120831 + 1),
|
||||
"Sutton SignWriting": range(120832, 121519 + 1),
|
||||
"Glagolitic Supplement": range(122880, 122927 + 1),
|
||||
"Mende Kikakui": range(124928, 125151 + 1),
|
||||
"Adlam": range(125184, 125279 + 1),
|
||||
"Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1),
|
||||
"Mahjong Tiles": range(126976, 127023 + 1),
|
||||
"Domino Tiles": range(127024, 127135 + 1),
|
||||
"Playing Cards": range(127136, 127231 + 1),
|
||||
"Enclosed Alphanumeric Supplement": range(127232, 127487 + 1),
|
||||
"Enclosed Ideographic Supplement": range(127488, 127743 + 1),
|
||||
"Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1),
|
||||
"Emoticons range(Emoji)": range(128512, 128591 + 1),
|
||||
"Ornamental Dingbats": range(128592, 128639 + 1),
|
||||
"Transport and Map Symbols": range(128640, 128767 + 1),
|
||||
"Alchemical Symbols": range(128768, 128895 + 1),
|
||||
"Geometric Shapes Extended": range(128896, 129023 + 1),
|
||||
"Supplemental Arrows-C": range(129024, 129279 + 1),
|
||||
"Supplemental Symbols and Pictographs": range(129280, 129535 + 1),
|
||||
"CJK Unified Ideographs Extension B": range(131072, 173791 + 1),
|
||||
"CJK Unified Ideographs Extension C": range(173824, 177983 + 1),
|
||||
"CJK Unified Ideographs Extension D": range(177984, 178207 + 1),
|
||||
"CJK Unified Ideographs Extension E": range(178208, 183983 + 1),
|
||||
"CJK Unified Ideographs Extension F": range(183984, 191471 + 1),
|
||||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||
"Tags": range(917504, 917631 + 1),
|
||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||
}
|
||||
|
||||
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||
"Supplement",
|
||||
"Extended",
|
||||
"Extensions",
|
||||
"Modifier",
|
||||
"Marks",
|
||||
"Punctuation",
|
||||
"Symbols",
|
||||
"Forms",
|
||||
"Operators",
|
||||
"Miscellaneous",
|
||||
"Drawing",
|
||||
"Block",
|
||||
"Shapes",
|
||||
"Supplemental",
|
||||
"Tags",
|
||||
]
|
||||
|
||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||
IGNORECASE,
|
||||
)
|
||||
|
||||
IANA_SUPPORTED: List[str] = sorted(
|
||||
filter(
|
||||
lambda x: x.endswith("_codec") is False
|
||||
and x not in {"rot_13", "tactis", "mbcs"},
|
||||
list(set(aliases.values())),
|
||||
)
|
||||
)
|
||||
|
||||
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||
|
||||
# Pre-computed code pages that are similar, according to the function cp_similarity.
|
||||
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||
"cp1125": ["cp866"],
|
||||
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
||||
"cp1250": ["iso8859_2"],
|
||||
"cp1251": ["kz1048", "ptcp154"],
|
||||
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
||||
"cp1253": ["iso8859_7"],
|
||||
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
||||
"cp1257": ["iso8859_13"],
|
||||
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
||||
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
||||
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
||||
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
||||
"cp857": ["cp850", "cp858", "cp865"],
|
||||
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
||||
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
||||
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
||||
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
||||
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
||||
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
||||
"cp866": ["cp1125"],
|
||||
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
||||
"iso8859_11": ["tis_620"],
|
||||
"iso8859_13": ["cp1257"],
|
||||
"iso8859_14": [
|
||||
"iso8859_10",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_15": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_16": [
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_2",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
],
|
||||
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
||||
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
||||
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
||||
"iso8859_7": ["cp1253"],
|
||||
"iso8859_9": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"cp1258",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_4",
|
||||
"latin_1",
|
||||
],
|
||||
"kz1048": ["cp1251", "ptcp154"],
|
||||
"latin_1": [
|
||||
"cp1252",
|
||||
"cp1254",
|
||||
"cp1258",
|
||||
"iso8859_10",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_4",
|
||||
"iso8859_9",
|
||||
],
|
||||
"mac_iceland": ["mac_roman", "mac_turkish"],
|
||||
"mac_roman": ["mac_iceland", "mac_turkish"],
|
||||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||
"ptcp154": ["cp1251", "kz1048"],
|
||||
"tis_620": ["iso8859_11"],
|
||||
}
|
||||
|
||||
|
||||
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||
"iso2022_kr": "ISO-2022-KR",
|
||||
"iso2022_jp": "ISO-2022-JP",
|
||||
"euc_kr": "EUC-KR",
|
||||
"tis_620": "TIS-620",
|
||||
"utf_32": "UTF-32",
|
||||
"euc_jp": "EUC-JP",
|
||||
"koi8_r": "KOI8-R",
|
||||
"iso8859_1": "ISO-8859-1",
|
||||
"iso8859_2": "ISO-8859-2",
|
||||
"iso8859_5": "ISO-8859-5",
|
||||
"iso8859_6": "ISO-8859-6",
|
||||
"iso8859_7": "ISO-8859-7",
|
||||
"iso8859_8": "ISO-8859-8",
|
||||
"utf_16": "UTF-16",
|
||||
"cp855": "IBM855",
|
||||
"mac_cyrillic": "MacCyrillic",
|
||||
"gb2312": "GB2312",
|
||||
"gb18030": "GB18030",
|
||||
"cp932": "CP932",
|
||||
"cp866": "IBM866",
|
||||
"utf_8": "utf-8",
|
||||
"utf_8_sig": "UTF-8-SIG",
|
||||
"shift_jis": "SHIFT_JIS",
|
||||
"big5": "Big5",
|
||||
"cp1250": "windows-1250",
|
||||
"cp1251": "windows-1251",
|
||||
"cp1252": "Windows-1252",
|
||||
"cp1253": "windows-1253",
|
||||
"cp1255": "windows-1255",
|
||||
"cp1256": "windows-1256",
|
||||
"cp1254": "Windows-1254",
|
||||
"cp949": "CP949",
|
||||
}
|
||||
|
||||
|
||||
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||
"<",
|
||||
">",
|
||||
"=",
|
||||
":",
|
||||
"/",
|
||||
"&",
|
||||
";",
|
||||
"{",
|
||||
"}",
|
||||
"[",
|
||||
"]",
|
||||
",",
|
||||
"|",
|
||||
'"',
|
||||
"-",
|
||||
}
|
||||
|
||||
|
||||
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||
|
||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||
|
||||
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||
|
||||
# Logging LEVEL below DEBUG
|
||||
TRACE: int = 5
|
||||
95
_vendor/charset_normalizer/legacy.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import warnings
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .constant import CHARDET_CORRESPONDENCE
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
|
||||
|
||||
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
|
||||
"""
|
||||
chardet legacy method
|
||||
Detect the encoding of the given byte string. It should be mostly backward-compatible.
|
||||
Encoding names will match Chardet's own spelling whenever possible. (Not for encoding names it does not support.)
This function is deprecated and should only be used to ease migrating your project; consult the documentation for
further information. Not planned for removal.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
"""
|
||||
if not isinstance(byte_str, (bytearray, bytes)):
|
||||
raise TypeError( # pragma: nocover
|
||||
"Expected object of type bytes or bytearray, got: "
|
||||
"{0}".format(type(byte_str))
|
||||
)
|
||||
|
||||
if isinstance(byte_str, bytearray):
|
||||
byte_str = bytes(byte_str)
|
||||
|
||||
r = from_bytes(byte_str).best()
|
||||
|
||||
encoding = r.encoding if r is not None else None
|
||||
language = r.language if r is not None and r.language != "Unknown" else ""
|
||||
confidence = 1.0 - r.chaos if r is not None else None
|
||||
|
||||
# Note: CharsetNormalizer does not return 'UTF-8-SIG', as the sig gets stripped in the detection/normalization process,
# but chardet does return 'utf-8-sig' and it is a valid codec name.
|
||||
if r is not None and encoding == "utf_8" and r.bom:
|
||||
encoding += "_sig"
|
||||
|
||||
return {
|
||||
"encoding": encoding
|
||||
if encoding not in CHARDET_CORRESPONDENCE
|
||||
else CHARDET_CORRESPONDENCE[encoding],
|
||||
"language": language,
|
||||
"confidence": confidence,
|
||||
}
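# Illustrative sketch (not part of the upstream sources): detect() mirrors the chardet API,
# so existing call sites can usually be migrated by swapping the import only.
#
#     from charset_normalizer import detect
#
#     result = detect("Bonjour, ceci est un café".encode("cp1252"))
#     # -> {"encoding": ..., "language": ..., "confidence": ...}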
|
||||
|
||||
|
||||
class CharsetNormalizerMatch(CharsetMatch):
|
||||
pass
|
||||
|
||||
|
||||
class CharsetNormalizerMatches(CharsetMatches):
|
||||
@staticmethod
|
||||
def from_fp(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_fp(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def from_bytes(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_bytes(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def from_path(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return from_path(*args, **kwargs) # pragma: nocover
|
||||
|
||||
@staticmethod
|
||||
def normalize(*args, **kwargs): # type: ignore
|
||||
warnings.warn( # pragma: nocover
|
||||
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
|
||||
"and scheduled to be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return normalize(*args, **kwargs) # pragma: nocover
|
||||
|
||||
|
||||
class CharsetDetector(CharsetNormalizerMatches):
|
||||
pass
|
||||
|
||||
|
||||
class CharsetDoctor(CharsetNormalizerMatches):
|
||||
pass
|
||||
553
_vendor/charset_normalizer/md.py
Normal file
@@ -0,0 +1,553 @@
|
||||
from functools import lru_cache
|
||||
from typing import List, Optional
|
||||
|
||||
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_ascii,
|
||||
is_case_variable,
|
||||
is_cjk,
|
||||
is_emoticon,
|
||||
is_hangul,
|
||||
is_hiragana,
|
||||
is_katakana,
|
||||
is_latin,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
"""
|
||||
Determine if given character should be fed in.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
"""
|
||||
The main routine to be executed upon each character.
Insert the logic by which the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
"""
|
||||
Reset the plugin to its initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; there is no upper restriction (any value > 0. is allowed).
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: Optional[str] = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if is_punctuation(character):
|
||||
self._punctuation_count += 1
|
||||
elif (
|
||||
character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
and is_emoticon(character) is False
|
||||
):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_accentuated(character):
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and is_accentuated(character)
|
||||
and is_accentuated(self._last_latin_character)
|
||||
):
|
||||
if character.isupper() and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
# Worse if it's the same char duplicated with a different accent.
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character.isspace()
|
||||
or is_punctuation(character)
|
||||
or character in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
self._last_printable_seen = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: Optional[str] = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
if ratio_of_suspicious_range_usage < 0.1:
|
||||
return 0.0
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
self._foreign_long_watch is False
|
||||
and (is_latin(character) is False or is_accentuated(character))
|
||||
and is_cjk(character) is False
|
||||
and is_hangul(character) is False
|
||||
and is_katakana(character) is False
|
||||
and is_hiragana(character) is False
|
||||
and is_thai(character) is False
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
return
|
||||
if not self._buffer:
|
||||
return
|
||||
if (
|
||||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length > 0.34:
|
||||
self._is_current_word_bad = True
|
||||
# Words/buffers ending with an upper-case accentuated letter are so rare
# that we will consider them all as suspicious. Same weight as a foreign_long suspicion.
|
||||
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += len(self._buffer)
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer = ""
|
||||
self._buffer_accent_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer += character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._buffer = ""
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
class CjkInvalidStopPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
GB (Chinese) based encodings often render the full stop incorrectly when the content does not fit, and
this can be easily detected. Searching for the overuse of '丅' and '丄'.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._wrong_stop_count: int = 0
|
||||
self._cjk_character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character in {"丅", "丄"}:
|
||||
self._wrong_stop_count += 1
|
||||
return
|
||||
if is_cjk(character):
|
||||
self._cjk_character_count += 1
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._wrong_stop_count = 0
|
||||
self._cjk_character_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._cjk_character_count < 16:
|
||||
return 0.0
|
||||
return self._wrong_stop_count / self._cjk_character_count
|
||||
|
||||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: Optional[str] = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
is_concerned = character.isalpha() and is_case_variable(character)
|
||||
chunk_sep = is_concerned is False
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and character.isdigit() is False
|
||||
and self._current_ascii_only is False
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only is True and is_ascii(character) is False:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
||||
character.islower() and self._last_alpha_seen.isupper()
|
||||
):
|
||||
if self._buf is True:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode ranges seen next to each other can be considered suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = unicode_range_a.split(
|
||||
" "
|
||||
), unicode_range_b.split(" ")
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
|
||||
"""
|
||||
|
||||
detectors: List[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
]
|
||||
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
intermediary_mean_mess_ratio_calc = 128
|
||||
|
||||
for character, index in zip(decoded_sequence + "\n", range(length)):
|
||||
for detector in detectors:
|
||||
if detector.eligible(character):
|
||||
detector.feed(character)
|
||||
|
||||
if (
|
||||
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
||||
) or index == length - 1:
|
||||
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
|
||||
if debug:
|
||||
for dt in detectors: # pragma: nocover
|
||||
print(dt.__class__, dt.ratio)
|
||||
|
||||
return round(mean_mess_ratio, 3)
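# Editor's sketch, not part of the upstream module: a minimal, standalone example of
# calling mess_ratio(), assuming "_vendor" is already on sys.path (as Updater.py arranges).
# The sample strings are illustrative assumptions, not fixtures shipped with the library.
from charset_normalizer.md import mess_ratio

clean = mess_ratio("Hello, this is a perfectly ordinary sentence.")
garbled = mess_ratio("H\ufffdllo w\ufffdrld \ufffd\ufffd\ufffd")
# A clean ASCII sentence should stay close to 0.0, while text full of replacement
# characters and stray symbols is scored higher by the plugins defined above.
print(clean, garbled)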
|
||||
401
_vendor/charset_normalizer/models.py
Normal file
@@ -0,0 +1,401 @@
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from encodings.aliases import aliases
|
||||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import (
|
||||
Any,
|
||||
Counter as TypeCounter,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
|
||||
from .md import mess_ratio
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: "CoherenceMatches",
|
||||
decoded_payload: Optional[str] = None,
|
||||
):
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: Optional[List[str]] = None
|
||||
|
||||
self._leaves: List[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: Optional[bytes] = None
|
||||
self._output_encoding: Optional[str] = None
|
||||
|
||||
self._string: Optional[str] = decoded_payload
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise TypeError(
|
||||
"__eq__ cannot be invoked on {} and {}.".format(
|
||||
str(other.__class__), str(self.__class__)
|
||||
)
|
||||
)
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Below 1% difference --> use coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
# When having a tough decision, use the result that decoded as many multi-byte as possible.
|
||||
if chaos_difference == 0.0 and self.coherence == other.coherence:
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
return self.coherence > other.coherence
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - len(str(self)) / len(self.raw)
|
||||
|
||||
@property
|
||||
def chaos_secondary_pass(self) -> float:
|
||||
"""
|
||||
Check once again chaos in decoded text, except this time, with full content.
|
||||
Use with caution, this can be very slow.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"chaos_secondary_pass is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return mess_ratio(str(self), 1.0)
|
||||
|
||||
@property
|
||||
def coherence_non_latin(self) -> float:
|
||||
"""
|
||||
Coherence ratio on the first non-latin language detected if ANY.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"coherence_non_latin is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return 0.0
|
||||
|
||||
@property
|
||||
def w_counter(self) -> TypeCounter[str]:
|
||||
"""
|
||||
Word counter instance on decoded text.
|
||||
Notice: Will be removed in 3.0
|
||||
"""
|
||||
warnings.warn(
|
||||
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
|
||||
)
|
||||
|
||||
string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
|
||||
|
||||
return Counter(string_printable_only.split())
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
|
||||
|
||||
def add_submatch(self, other: "CharsetMatch") -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> List[str]:
|
||||
"""
|
||||
Encodings are known by many names; using this could help when searching for IBM855 when it is listed as CP855.
|
||||
"""
|
||||
also_known_as: List[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> List[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. The returned list may be empty even if the 'language' property returns something other than 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# It's either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> List["CharsetMatch"]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> List[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: List[Optional[str]] = [
|
||||
unicode_range(char) for char in str(self)
|
||||
]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> List[str]:
|
||||
"""
|
||||
The complete list of encodings that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in the 'encoding' property.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def first(self) -> "CharsetMatch":
|
||||
"""
|
||||
Kept for BC reasons. Will be removed in 3.0.
|
||||
"""
|
||||
return self
|
||||
|
||||
def best(self) -> "CharsetMatch":
|
||||
"""
|
||||
Kept for BC reasons. Will be removed in 3.0.
|
||||
"""
|
||||
return self
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
Characters that cannot be encoded are replaced by the encoder rather than raising an error.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
self._output_payload = str(self).encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
"""
|
||||
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
||||
"""
|
||||
return sha256(self.output()).hexdigest()
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container of CharsetMatch items, ordered by default from the most probable to the least probable.
Acts like a list (iterable) but does not implement all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: Optional[List[CharsetMatch]] = None):
|
||||
self._results: List[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. It will be inserted at the proper position so that sorting is preserved.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) <= TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> Optional["CharsetMatch"]:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> Optional["CharsetMatch"]:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
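# Editor's sketch, not part of the upstream module: typical consumption of CharsetMatches,
# assuming the package's top-level from_bytes() helper (defined in api.py, outside this
# hunk) is vendored alongside; the sample text is an illustrative assumption.
from charset_normalizer import from_bytes

matches = from_bytes("Bonjour, ceci est un texte accentué : déjà vu.".encode("cp1252"))
best = matches.best()  # first (most probable) CharsetMatch, or None when nothing matched
if best is not None:
    print(best.encoding, best.could_be_from_charset, best.fingerprint[:12])
    print(str(best))   # lazily decoded payload, see CharsetMatch.__str__ above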
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: Optional[str],
|
||||
encoding_aliases: List[str],
|
||||
alternative_encodings: List[str],
|
||||
language: str,
|
||||
alphabets: List[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: Optional[str],
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: Optional[str] = unicode_path
|
||||
self.encoding: Optional[str] = encoding
|
||||
self.encoding_aliases: List[str] = encoding_aliases
|
||||
self.alternative_encodings: List[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: List[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> Dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
||||
0
_vendor/charset_normalizer/py.typed
Normal file
424
_vendor/charset_normalizer/utils.py
Normal file
@@ -0,0 +1,424 @@
|
||||
try:
|
||||
# WARNING: unicodedata2 support is going to be removed in 3.0
|
||||
# Python is quickly catching up.
|
||||
import unicodedata2 as unicodedata
|
||||
except ImportError:
|
||||
import unicodedata # type: ignore[no-redef]
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator, List, Optional, Set, Tuple, Union
|
||||
|
||||
from _multibytecodec import MultibyteIncrementalDecoder
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return (
|
||||
"WITH GRAVE" in description
|
||||
or "WITH ACUTE" in description
|
||||
or "WITH CEDILLA" in description
|
||||
or "WITH DIAERESIS" in description
|
||||
or "WITH CIRCUMFLEX" in description
|
||||
or "WITH TILDE" in description
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: List[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> Optional[str]:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
return range_name
|
||||
|
||||
return None
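# Editor's sketch, not part of the upstream module: unicode_range() maps a single character
# to its Unicode block name as listed in UNICODE_RANGES_COMBINED (constant.py).
from charset_normalizer.utils import unicode_range

print(unicode_range("a"))   # expected to be "Basic Latin"
print(unicode_range("é"))   # a Latin supplement/extended block, depending on the table
print(unicode_range("丅"))  # a CJK block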
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_ascii(character: str) -> bool:
|
||||
try:
|
||||
character.encode("ascii")
|
||||
except UnicodeEncodeError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Punctuation" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Forms" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Emoticons" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_separator(character: str) -> bool:
|
||||
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_case_variable(character: str) -> bool:
|
||||
return character.islower() != character.isupper()
|
||||
|
||||
|
||||
def is_private_use_only(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return character_category == "Co"
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "CJK" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hiragana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "HIRAGANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_katakana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "KATAKANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hangul(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "HANGUL" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_thai(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return "THAI" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
||||
def is_unicode_range_secondary(range_name: str) -> bool:
|
||||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
|
||||
"""
|
||||
Extract, using an ASCII-only decoder, any specified encoding declared in the first n bytes.
|
||||
"""
|
||||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results: List[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
||||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
if encoding_iana == specified_encoding:
|
||||
return encoding_iana
|
||||
|
||||
return None
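# Editor's sketch, not part of the upstream module: any_specified_encoding() looks for an
# encoding declaration (charset=... / encoding=...) in the leading bytes and maps it through
# Python's alias table; the XML header below is an illustrative assumption.
from charset_normalizer.utils import any_specified_encoding

header = b'<?xml version="1.0" encoding="ISO-8859-1"?><root/>'
print(any_specified_encoding(header))                  # an IANA-normalized name such as "latin_1"
print(any_specified_encoding(b"no declaration here"))  # None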
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def is_multi_byte_encoding(name: str) -> bool:
|
||||
"""
|
||||
Verify whether a specific encoding is a multi-byte one, based on its IANA name.
|
||||
"""
|
||||
return name in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_le",
|
||||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
||||
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
|
||||
"""
|
||||
Identify and extract SIG/BOM in given sequence.
|
||||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
||||
for mark in marks:
|
||||
if sequence.startswith(mark):
|
||||
return iana_encoding, mark
|
||||
|
||||
return None, b""
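# Editor's sketch, not part of the upstream module: identify_sig_or_bom() compares the
# leading bytes against the BOM/signature table in ENCODING_MARKS and returns the matching
# IANA name plus the raw mark, or (None, b"") when nothing matches.
from charset_normalizer.utils import identify_sig_or_bom

sig, mark = identify_sig_or_bom(b"\xef\xbb\xbfhello")
print(sig, mark)                      # expected: the UTF-8 signature entry and b'\xef\xbb\xbf'
print(identify_sig_or_bom(b"hello"))  # (None, b'')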
|
||||
|
||||
|
||||
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
||||
return iana_encoding not in {"utf_16", "utf_32"}
|
||||
|
||||
|
||||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
||||
if strict:
|
||||
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
|
||||
|
||||
return cp_name
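# Editor's sketch, not part of the upstream module: iana_name() normalizes a user-supplied
# codec name through Python's alias table; with strict=False it falls back to the normalized
# input instead of raising ValueError.
from charset_normalizer.utils import iana_name

print(iana_name("UTF-8"))                      # "utf_8"
print(iana_name("not-a-codec", strict=False))  # "not_a_codec" (returned as-is, normalized)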
|
||||
|
||||
|
||||
def range_scan(decoded_sequence: str) -> List[str]:
|
||||
ranges: Set[str] = set()
|
||||
|
||||
for character in decoded_sequence:
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
ranges.add(character_range)
|
||||
|
||||
return list(ranges)
|
||||
|
||||
|
||||
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
||||
|
||||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_a)
|
||||
).IncrementalDecoder
|
||||
decoder_b = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_b)
|
||||
).IncrementalDecoder
|
||||
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
return character_match_count / 254
|
||||
|
||||
|
||||
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
|
||||
"""
|
||||
Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
|
||||
"""
|
||||
return (
|
||||
iana_name_a in IANA_SUPPORTED_SIMILAR
|
||||
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
||||
)
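# Editor's sketch, not part of the upstream module: cp_similarity() brute-forces bytes
# 0x00-0xFE through both single-byte decoders and reports the share of identical results,
# while is_cp_similar() answers from the precomputed IANA_SUPPORTED_SIMILAR table.
from charset_normalizer.utils import cp_similarity, is_cp_similar

print(cp_similarity("cp1252", "latin_1"))  # a float in [0, 1]; exact value depends on the codecs
print(is_cp_similar("cp1252", "latin_1"))  # True only if listed in IANA_SUPPORTED_SIMILAR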
|
||||
|
||||
|
||||
def set_logging_handler(
|
||||
name: str = "charset_normalizer",
|
||||
level: int = logging.INFO,
|
||||
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
|
||||
) -> None:
|
||||
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(level)
|
||||
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
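# Editor's sketch, not part of the upstream module: attaching the default stream handler to
# the package logger, e.g. to watch detection decisions while Updater.py downloads pages.
import logging

from charset_normalizer.utils import set_logging_handler

set_logging_handler(level=logging.DEBUG)
logging.getLogger("charset_normalizer").debug("logging handler attached")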
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: Optional[str] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
||||
6
_vendor/charset_normalizer/version.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
Expose version
|
||||
"""
|
||||
|
||||
__version__ = "2.1.1"
|
||||
VERSION = __version__.split(".")
|
||||
1
_vendor/idna-3.4.dist-info/INSTALLER
Normal file
@@ -0,0 +1 @@
|
||||
pip
|
||||
29
_vendor/idna-3.4.dist-info/LICENSE.md
Normal file
@@ -0,0 +1,29 @@
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2013-2021, Kim Davies
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
242
_vendor/idna-3.4.dist-info/METADATA
Normal file
@@ -0,0 +1,242 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: idna
|
||||
Version: 3.4
|
||||
Summary: Internationalized Domain Names in Applications (IDNA)
|
||||
Author-email: Kim Davies <kim@cynosure.com.au>
|
||||
Requires-Python: >=3.5
|
||||
Description-Content-Type: text/x-rst
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Intended Audience :: System Administrators
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: Name Service (DNS)
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Utilities
|
||||
Project-URL: Changelog, https://github.com/kjd/idna/blob/master/HISTORY.rst
|
||||
Project-URL: Issue tracker, https://github.com/kjd/idna/issues
|
||||
Project-URL: Source, https://github.com/kjd/idna
|
||||
|
||||
Internationalized Domain Names in Applications (IDNA)
|
||||
=====================================================
|
||||
|
||||
Support for the Internationalized Domain Names in
|
||||
Applications (IDNA) protocol as specified in `RFC 5891
|
||||
<https://tools.ietf.org/html/rfc5891>`_. This is the latest version of
|
||||
the protocol and is sometimes referred to as “IDNA 2008”.
|
||||
|
||||
This library also provides support for Unicode Technical
|
||||
Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
This acts as a suitable replacement for the “encodings.idna”
|
||||
module that comes with the Python standard library, but which
|
||||
only supports the older superseded IDNA specification (`RFC 3490
|
||||
<https://tools.ietf.org/html/rfc3490>`_).
|
||||
|
||||
Basic functions are simply executed:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
This package is available for installation from PyPI:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python3 -m pip install idna
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
For typical usage, the ``encode`` and ``decode`` functions will take a
|
||||
domain name argument and perform a conversion to A-labels or U-labels
|
||||
respectively.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
You may use the codec encoding and decoding methods using the
|
||||
``idna.codec`` module:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna.codec
|
||||
>>> print('домен.испытание'.encode('idna'))
|
||||
b'xn--d1acufc.xn--80akhbyknj4f'
|
||||
>>> print(b'xn--d1acufc.xn--80akhbyknj4f'.decode('idna'))
|
||||
домен.испытание
|
||||
|
||||
Conversions can be applied at a per-label basis using the ``ulabel`` or
|
||||
``alabel`` functions if necessary:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.alabel('测试')
|
||||
b'xn--0zwm56d'
|
||||
|
||||
Compatibility Mapping (UTS #46)
|
||||
+++++++++++++++++++++++++++++++
|
||||
|
||||
As described in `RFC 5895 <https://tools.ietf.org/html/rfc5895>`_, the
|
||||
IDNA specification does not normalize input from different potential
|
||||
ways a user may input a domain name. This functionality, known as
|
||||
a “mapping”, is considered by the specification to be a local
|
||||
user-interface issue distinct from IDNA conversion functionality.
|
||||
|
||||
This library provides one such mapping, that was developed by the
|
||||
Unicode Consortium. Known as `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_, it provides for both a regular
|
||||
mapping for typical applications, as well as a transitional mapping to
|
||||
help migrate from older IDNA 2003 applications.
|
||||
|
||||
For example, “Königsgäßchen” is not a permissible label as *LATIN
|
||||
CAPITAL LETTER K* is not allowed (nor are capital letters in general).
|
||||
UTS 46 will convert this into lower case prior to applying the IDNA
|
||||
conversion.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('Königsgäßchen')
|
||||
...
|
||||
idna.core.InvalidCodepoint: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed
|
||||
>>> idna.encode('Königsgäßchen', uts46=True)
|
||||
b'xn--knigsgchen-b4a3dun'
|
||||
>>> print(idna.decode('xn--knigsgchen-b4a3dun'))
|
||||
königsgäßchen
|
||||
|
||||
Transitional processing provides conversions to help transition from
|
||||
the older 2003 standard to the current standard. For example, in the
|
||||
original IDNA specification, the *LATIN SMALL LETTER SHARP S* (ß) was
|
||||
converted into two *LATIN SMALL LETTER S* (ss), whereas in the current
|
||||
IDNA specification this conversion is not performed.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.encode('Königsgäßchen', uts46=True, transitional=True)
|
||||
b'xn--knigsgsschen-lcb0w'
|
||||
|
||||
Implementors should use transitional processing with caution, only in
|
||||
rare cases where conversion from legacy labels to current labels must be
|
||||
performed (i.e. IDNA implementations that pre-date 2008). For typical
|
||||
applications that just need to convert labels, transitional processing
|
||||
is unlikely to be beneficial and could produce unexpected incompatible
|
||||
results.
|
||||
|
||||
``encodings.idna`` Compatibility
|
||||
++++++++++++++++++++++++++++++++
|
||||
|
||||
Function calls from the Python built-in ``encodings.idna`` module are
|
||||
mapped to their IDNA 2008 equivalents using the ``idna.compat`` module.
|
||||
Simply substitute the ``import`` clause in your code to refer to the new
|
||||
module name.
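For example (an illustrative sketch based on the conversions shown above):

.. code-block:: pycon

    >>> from idna import compat
    >>> compat.ToASCII('ドメイン.テスト')
    b'xn--eckwd4c7c.xn--zckzah'
    >>> compat.ToUnicode(b'xn--eckwd4c7c.xn--zckzah')
    'ドメイン.テスト'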
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
All errors raised during the conversion following the specification
|
||||
should raise an exception derived from the ``idna.IDNAError`` base
|
||||
class.
|
||||
|
||||
More specific exceptions may be generated, such as ``idna.IDNABidiError``
|
||||
when the error reflects an illegal combination of left-to-right and
|
||||
right-to-left characters in a label; ``idna.InvalidCodepoint`` when
|
||||
a specific codepoint is an illegal character in an IDN label (i.e.
|
||||
INVALID); and ``idna.InvalidCodepointContext`` when the codepoint is
|
||||
illegal based on its positional context (i.e. it is CONTEXTO or CONTEXTJ
|
||||
but the contextual requirements are not satisfied.)
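For example (an illustrative sketch, reusing the label from the UTS 46 section above):

.. code-block:: pycon

    >>> import idna
    >>> try:
    ...     idna.encode('Königsgäßchen')
    ... except idna.IDNAError as exc:
    ...     print('rejected:', exc)
    rejected: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed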
|
||||
|
||||
Building and Diagnostics
|
||||
------------------------
|
||||
|
||||
The IDNA and UTS 46 functionality relies upon pre-calculated lookup
|
||||
tables for performance. These tables are derived from computing against
|
||||
eligibility criteria in the respective standards. These tables are
|
||||
computed using the command-line script ``tools/idna-data``.
|
||||
|
||||
This tool will fetch relevant codepoint data from the Unicode repository
|
||||
and perform the required calculations to identify eligibility. There are
|
||||
three main modes:
|
||||
|
||||
* ``idna-data make-libdata``. Generates ``idnadata.py`` and
|
||||
``uts46data.py``, the pre-calculated lookup tables used for IDNA and
|
||||
UTS 46 conversions. Implementors who wish to track this library against
|
||||
a different Unicode version may use this tool to manually generate a
|
||||
different version of the ``idnadata.py`` and ``uts46data.py`` files.
|
||||
|
||||
* ``idna-data make-table``. Generate a table of the IDNA disposition
|
||||
(e.g. PVALID, CONTEXTJ, CONTEXTO) in the format found in Appendix
|
||||
B.1 of RFC 5892 and the pre-computed tables published by `IANA
|
||||
<https://www.iana.org/>`_.
|
||||
|
||||
* ``idna-data U+0061``. Prints debugging output on the various
|
||||
properties associated with an individual Unicode codepoint (in this
|
||||
case, U+0061), that are used to assess the IDNA and UTS 46 status of a
|
||||
codepoint. This is helpful in debugging or analysis.
|
||||
|
||||
The tool accepts a number of arguments, described using ``idna-data
|
||||
-h``. Most notably, the ``--version`` argument allows the specification
|
||||
of the version of Unicode to use in computing the table data. For
|
||||
example, ``idna-data --version 9.0.0 make-libdata`` will generate
|
||||
library data against Unicode 9.0.0.
|
||||
|
||||
|
||||
Additional Notes
|
||||
----------------
|
||||
|
||||
* **Packages**. The latest tagged release version is published in the
|
||||
`Python Package Index <https://pypi.org/project/idna/>`_.
|
||||
|
||||
* **Version support**. This library supports Python 3.5 and higher.
|
||||
As this library serves as a low-level toolkit for a variety of
|
||||
applications, many of which strive for broad compatibility with older
|
||||
Python versions, there is no rush to remove older interpreter support.
|
||||
Removing support for older versions should be well justified in that the
|
||||
maintenance burden has become too high.
|
||||
|
||||
* **Python 2**. Python 2 is supported by version 2.x of this library.
|
||||
While active development of the version 2.x series has ended, notable
|
||||
issues being corrected may be backported to 2.x. Use "idna<3" in your
|
||||
requirements file if you need this library for a Python 2 application.
|
||||
|
||||
* **Testing**. The library has a test suite based on each rule of the
|
||||
IDNA specification, as well as tests that are provided as part of the
|
||||
Unicode Technical Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
* **Emoji**. It is an occasional request to support emoji domains in
|
||||
this library. Encoding of symbols like emoji is expressly prohibited by
|
||||
the technical standard IDNA 2008 and emoji domains are broadly phased
|
||||
out across the domain industry due to associated security risks. For
|
||||
now, applications that need to support these non-compliant labels
|
||||
may wish to consider trying the encode/decode operation in this library
|
||||
first, and then falling back to using `encodings.idna`. See `the Github
|
||||
project <https://github.com/kjd/idna/issues/18>`_ for more discussion.
|
||||
|
||||
22
_vendor/idna-3.4.dist-info/RECORD
Normal file
@@ -0,0 +1,22 @@
|
||||
idna-3.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
idna-3.4.dist-info/LICENSE.md,sha256=otbk2UC9JNvnuWRc3hmpeSzFHbeuDVrNMBrIYMqj6DY,1523
|
||||
idna-3.4.dist-info/METADATA,sha256=8aLSf9MFS7oB26pZh2hprg7eJp0UJSc-3rpf_evp4DA,9830
|
||||
idna-3.4.dist-info/RECORD,,
|
||||
idna-3.4.dist-info/WHEEL,sha256=4TfKIB_xu-04bc2iKz6_zFt-gEFEEDU_31HGhqzOCE8,81
|
||||
idna/__init__.py,sha256=KJQN1eQBr8iIK5SKrJ47lXvxG0BJ7Lm38W4zT0v_8lk,849
|
||||
idna/__pycache__/__init__.cpython-310.pyc,,
|
||||
idna/__pycache__/codec.cpython-310.pyc,,
|
||||
idna/__pycache__/compat.cpython-310.pyc,,
|
||||
idna/__pycache__/core.cpython-310.pyc,,
|
||||
idna/__pycache__/idnadata.cpython-310.pyc,,
|
||||
idna/__pycache__/intranges.cpython-310.pyc,,
|
||||
idna/__pycache__/package_data.cpython-310.pyc,,
|
||||
idna/__pycache__/uts46data.cpython-310.pyc,,
|
||||
idna/codec.py,sha256=6ly5odKfqrytKT9_7UrlGklHnf1DSK2r9C6cSM4sa28,3374
|
||||
idna/compat.py,sha256=0_sOEUMT4CVw9doD3vyRhX80X19PwqFoUBs7gWsFME4,321
|
||||
idna/core.py,sha256=1JxchwKzkxBSn7R_oCE12oBu3eVux0VzdxolmIad24M,12950
|
||||
idna/idnadata.py,sha256=xUjqKqiJV8Ho_XzBpAtv5JFoVPSupK-SUXvtjygUHqw,44375
|
||||
idna/intranges.py,sha256=YBr4fRYuWH7kTKS2tXlFjM24ZF1Pdvcir-aywniInqg,1881
|
||||
idna/package_data.py,sha256=C_jHJzmX8PI4xq0jpzmcTMxpb5lDsq4o5VyxQzlVrZE,21
|
||||
idna/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
idna/uts46data.py,sha256=zvjZU24s58_uAS850Mcd0NnD0X7_gCMAMjzWNIeUJdc,206539
|
||||
4
_vendor/idna-3.4.dist-info/WHEEL
Normal file
@@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: flit 3.7.1
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
44
_vendor/idna/__init__.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from .package_data import __version__
|
||||
from .core import (
|
||||
IDNABidiError,
|
||||
IDNAError,
|
||||
InvalidCodepoint,
|
||||
InvalidCodepointContext,
|
||||
alabel,
|
||||
check_bidi,
|
||||
check_hyphen_ok,
|
||||
check_initial_combiner,
|
||||
check_label,
|
||||
check_nfc,
|
||||
decode,
|
||||
encode,
|
||||
ulabel,
|
||||
uts46_remap,
|
||||
valid_contextj,
|
||||
valid_contexto,
|
||||
valid_label_length,
|
||||
valid_string_length,
|
||||
)
|
||||
from .intranges import intranges_contain
|
||||
|
||||
__all__ = [
|
||||
"IDNABidiError",
|
||||
"IDNAError",
|
||||
"InvalidCodepoint",
|
||||
"InvalidCodepointContext",
|
||||
"alabel",
|
||||
"check_bidi",
|
||||
"check_hyphen_ok",
|
||||
"check_initial_combiner",
|
||||
"check_label",
|
||||
"check_nfc",
|
||||
"decode",
|
||||
"encode",
|
||||
"intranges_contain",
|
||||
"ulabel",
|
||||
"uts46_remap",
|
||||
"valid_contextj",
|
||||
"valid_contexto",
|
||||
"valid_label_length",
|
||||
"valid_string_length",
|
||||
]
|
||||
BIN
_vendor/idna/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/codec.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/compat.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/core.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/idnadata.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/intranges.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/package_data.cpython-310.pyc
Normal file
Binary file not shown.
BIN
_vendor/idna/__pycache__/uts46data.cpython-310.pyc
Normal file
Binary file not shown.
112
_vendor/idna/codec.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from .core import encode, decode, alabel, ulabel, IDNAError
|
||||
import codecs
|
||||
import re
|
||||
from typing import Tuple, Optional
|
||||
|
||||
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
|
||||
def encode(self, data: str, errors: str = 'strict') -> Tuple[bytes, int]:
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return b"", 0
|
||||
|
||||
return encode(data), len(data)
|
||||
|
||||
def decode(self, data: bytes, errors: str = 'strict') -> Tuple[str, int]:
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return '', 0
|
||||
|
||||
return decode(data), len(data)
|
||||
|
||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
|
||||
def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return "", 0
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = ''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = '.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = '.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(alabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
# Join with U+002E
|
||||
result_str = '.'.join(result) + trailing_dot # type: ignore
|
||||
size += len(trailing_dot)
|
||||
return result_str, size
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def _buffer_decode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore
|
||||
if errors != 'strict':
|
||||
raise IDNAError('Unsupported error handling \"{}\"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return ('', 0)
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = ''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = '.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = '.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(ulabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
result_str = '.'.join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return (result_str, size)
|
||||
|
||||
|
||||
class StreamWriter(Codec, codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
|
||||
class StreamReader(Codec, codecs.StreamReader):
|
||||
pass
|
||||
|
||||
|
||||
def getregentry() -> codecs.CodecInfo:
|
||||
# Compatibility as a search_function for codecs.register()
|
||||
return codecs.CodecInfo(
|
||||
name='idna',
|
||||
encode=Codec().encode, # type: ignore
|
||||
decode=Codec().decode, # type: ignore
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
)
|
||||
13
_vendor/idna/compat.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from .core import *
|
||||
from .codec import *
|
||||
from typing import Any, Union
|
||||
|
||||
def ToASCII(label: str) -> bytes:
|
||||
return encode(label)
|
||||
|
||||
def ToUnicode(label: Union[bytes, bytearray]) -> str:
|
||||
return decode(label)
|
||||
|
||||
def nameprep(s: Any) -> None:
|
||||
raise NotImplementedError('IDNA 2008 does not utilise nameprep protocol')
|
||||
|
||||
400
_vendor/idna/core.py
Normal file
@@ -0,0 +1,400 @@
|
||||
from . import idnadata
|
||||
import bisect
|
||||
import unicodedata
|
||||
import re
|
||||
from typing import Union, Optional
|
||||
from .intranges import intranges_contain
|
||||
|
||||
_virama_combining_class = 9
|
||||
_alabel_prefix = b'xn--'
|
||||
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
class IDNAError(UnicodeError):
|
||||
""" Base exception for all IDNA-encoding related problems """
|
||||
pass
|
||||
|
||||
|
||||
class IDNABidiError(IDNAError):
|
||||
""" Exception when bidirectional requirements are not satisfied """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepoint(IDNAError):
|
||||
""" Exception when a disallowed or unallocated codepoint is used """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepointContext(IDNAError):
|
||||
""" Exception when the codepoint is not valid in the context it is used """
|
||||
pass
|
||||
|
||||
|
||||
def _combining_class(cp: int) -> int:
|
||||
v = unicodedata.combining(chr(cp))
|
||||
if v == 0:
|
||||
if not unicodedata.name(chr(cp)):
|
||||
raise ValueError('Unknown character in unicodedata')
|
||||
return v
|
||||
|
||||
def _is_script(cp: str, script: str) -> bool:
|
||||
return intranges_contain(ord(cp), idnadata.scripts[script])
|
||||
|
||||
def _punycode(s: str) -> bytes:
|
||||
return s.encode('punycode')
|
||||
|
||||
def _unot(s: int) -> str:
|
||||
return 'U+{:04X}'.format(s)
|
||||
|
||||
|
||||
def valid_label_length(label: Union[bytes, str]) -> bool:
|
||||
if len(label) > 63:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
|
||||
if len(label) > (254 if trailing_dot else 253):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def check_bidi(label: str, check_ltr: bool = False) -> bool:
|
||||
# Bidi rules should only be applied if string contains RTL characters
|
||||
bidi_label = False
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
if direction == '':
|
||||
# String likely comes from a newer version of Unicode
|
||||
raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), idx))
|
||||
if direction in ['R', 'AL', 'AN']:
|
||||
bidi_label = True
|
||||
if not bidi_label and not check_ltr:
|
||||
return True
|
||||
|
||||
# Bidi rule 1
|
||||
direction = unicodedata.bidirectional(label[0])
|
||||
if direction in ['R', 'AL']:
|
||||
rtl = True
|
||||
elif direction == 'L':
|
||||
rtl = False
|
||||
else:
|
||||
raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))
|
||||
|
||||
valid_ending = False
|
||||
number_type = None # type: Optional[str]
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
|
||||
if rtl:
|
||||
# Bidi rule 2
|
||||
if not direction in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(idx))
|
||||
# Bidi rule 3
|
||||
if direction in ['R', 'AL', 'EN', 'AN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
# Bidi rule 4
|
||||
if direction in ['AN', 'EN']:
|
||||
if not number_type:
|
||||
number_type = direction
|
||||
else:
|
||||
if number_type != direction:
|
||||
raise IDNABidiError('Can not mix numeral types in a right-to-left label')
|
||||
else:
|
||||
# Bidi rule 5
|
||||
if not direction in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(idx))
|
||||
# Bidi rule 6
|
||||
if direction in ['L', 'EN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
|
||||
if not valid_ending:
|
||||
raise IDNABidiError('Label ends with illegal codepoint directionality')
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def check_initial_combiner(label: str) -> bool:
|
||||
if unicodedata.category(label[0])[0] == 'M':
|
||||
raise IDNAError('Label begins with an illegal combining character')
|
||||
return True
|
||||
|
||||
|
||||
def check_hyphen_ok(label: str) -> bool:
|
||||
if label[2:4] == '--':
|
||||
raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
|
||||
if label[0] == '-' or label[-1] == '-':
|
||||
raise IDNAError('Label must not start or end with a hyphen')
|
||||
return True
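# Editor's sketch, not part of the upstream module: the check_* helpers raise IDNAError
# subclasses rather than returning False, so callers typically wrap them in try/except.
from idna.core import IDNAError, check_hyphen_ok

for candidate in ("example", "-leading", "ab--cd"):
    try:
        check_hyphen_ok(candidate)
        print(candidate, "ok")
    except IDNAError as exc:
        print(candidate, "rejected:", exc)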
|
||||
|
||||
|
||||
def check_nfc(label: str) -> None:
|
||||
if unicodedata.normalize('NFC', label) != label:
|
||||
raise IDNAError('Label must be in Normalization Form C')
|
||||
|
||||
|
||||
def valid_contextj(label: str, pos: int) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x200c:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
|
||||
ok = False
|
||||
for i in range(pos-1, -1, -1):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('L'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
|
||||
if not ok:
|
||||
return False
|
||||
|
||||
ok = False
|
||||
for i in range(pos+1, len(label)):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('R'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
return ok
|
||||
|
||||
if cp_value == 0x200d:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
return False
|
||||
|
||||
else:
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x00b7:
|
||||
if 0 < pos < len(label)-1:
|
||||
if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
|
||||
return True
|
||||
return False
|
||||
|
||||
elif cp_value == 0x0375:
|
||||
if pos < len(label)-1 and len(label) > 1:
|
||||
return _is_script(label[pos + 1], 'Greek')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x05f3 or cp_value == 0x05f4:
|
||||
if pos > 0:
|
||||
return _is_script(label[pos - 1], 'Hebrew')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x30fb:
|
||||
for cp in label:
|
||||
if cp == '\u30fb':
|
||||
continue
|
||||
if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
|
||||
return True
|
||||
return False
|
||||
|
||||
elif 0x660 <= cp_value <= 0x669:
|
||||
for cp in label:
|
||||
if 0x6f0 <= ord(cp) <= 0x06f9:
|
||||
return False
|
||||
return True
|
||||
|
||||
elif 0x6f0 <= cp_value <= 0x6f9:
|
||||
for cp in label:
|
||||
if 0x660 <= ord(cp) <= 0x0669:
|
||||
return False
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def check_label(label: Union[str, bytes, bytearray]) -> None:
|
||||
if isinstance(label, (bytes, bytearray)):
|
||||
label = label.decode('utf-8')
|
||||
if len(label) == 0:
|
||||
raise IDNAError('Empty Label')
|
||||
|
||||
check_nfc(label)
|
||||
check_hyphen_ok(label)
|
||||
check_initial_combiner(label)
|
||||
|
||||
for (pos, cp) in enumerate(label):
|
||||
cp_value = ord(cp)
|
||||
if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
|
||||
continue
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
|
||||
try:
|
||||
if not valid_contextj(label, pos):
|
||||
raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
except ValueError:
|
||||
raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
|
||||
if not valid_contexto(label, pos):
|
||||
raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
else:
|
||||
raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
|
||||
check_bidi(label)
|
||||
|
||||
|
||||
def alabel(label: str) -> bytes:
|
||||
try:
|
||||
label_bytes = label.encode('ascii')
|
||||
ulabel(label_bytes)
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError('Label too long')
|
||||
return label_bytes
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
|
||||
if not label:
|
||||
raise IDNAError('No Input')
|
||||
|
||||
label = str(label)
|
||||
check_label(label)
|
||||
label_bytes = _punycode(label)
|
||||
label_bytes = _alabel_prefix + label_bytes
|
||||
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError('Label too long')
|
||||
|
||||
return label_bytes
|
||||
|
||||
|
||||
def ulabel(label: Union[str, bytes, bytearray]) -> str:
|
||||
if not isinstance(label, (bytes, bytearray)):
|
||||
try:
|
||||
label_bytes = label.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
check_label(label)
|
||||
return label
|
||||
else:
|
||||
label_bytes = label
|
||||
|
||||
label_bytes = label_bytes.lower()
|
||||
if label_bytes.startswith(_alabel_prefix):
|
||||
label_bytes = label_bytes[len(_alabel_prefix):]
|
||||
if not label_bytes:
|
||||
raise IDNAError('Malformed A-label, no Punycode eligible content found')
|
||||
if label_bytes.decode('ascii')[-1] == '-':
|
||||
raise IDNAError('A-label must not end with a hyphen')
|
||||
else:
|
||||
check_label(label_bytes)
|
||||
return label_bytes.decode('ascii')
|
||||
|
||||
try:
|
||||
label = label_bytes.decode('punycode')
|
||||
except UnicodeError:
|
||||
raise IDNAError('Invalid A-label')
|
||||
check_label(label)
|
||||
return label
|
||||
|
||||
|
||||
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
|
||||
"""Re-map the characters in the string according to UTS46 processing."""
|
||||
from .uts46data import uts46data
|
||||
output = ''
|
||||
|
||||
for pos, char in enumerate(domain):
|
||||
code_point = ord(char)
|
||||
try:
|
||||
uts46row = uts46data[code_point if code_point < 256 else
|
||||
bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
|
||||
status = uts46row[1]
|
||||
replacement = None # type: Optional[str]
|
||||
if len(uts46row) == 3:
|
||||
replacement = uts46row[2] # type: ignore
|
||||
if (status == 'V' or
|
||||
(status == 'D' and not transitional) or
|
||||
(status == '3' and not std3_rules and replacement is None)):
|
||||
output += char
|
||||
elif replacement is not None and (status == 'M' or
|
||||
(status == '3' and not std3_rules) or
|
||||
(status == 'D' and transitional)):
|
||||
output += replacement
|
||||
elif status != 'I':
|
||||
raise IndexError()
|
||||
except IndexError:
|
||||
raise InvalidCodepoint(
|
||||
'Codepoint {} not allowed at position {} in {}'.format(
|
||||
_unot(code_point), pos + 1, repr(domain)))
|
||||
|
||||
return unicodedata.normalize('NFC', output)
|
||||
|
||||
|
||||
def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
try:
|
||||
s = s.decode('ascii')
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError('should pass a unicode string to the function rather than a byte string.')
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, transitional)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if strict:
|
||||
labels = s.split('.')
|
||||
else:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if labels[-1] == '':
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = alabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append(b'')
|
||||
s = b'.'.join(result)
|
||||
if not valid_string_length(s, trailing_dot):
|
||||
raise IDNAError('Domain too long')
|
||||
return s
|
||||
|
||||
|
||||
def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
|
||||
try:
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
s = s.decode('ascii')
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError('Invalid ASCII in A-label')
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, False)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if not strict:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
else:
|
||||
labels = s.split('.')
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if not labels[-1]:
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = ulabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append('')
|
||||
return '.'.join(result)
|
||||
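The encode()/decode() pair above is the public entry point of the vendored idna module: encode() validates each label (check_label, check_bidi, ...) and produces 'xn--'-prefixed A-labels, decode() reverses the mapping. A minimal usage sketch, for illustration only (not part of the vendored file; it assumes the package is importable as `idna`):

```python
# Illustration only - not part of the vendored file.
import idna

# encode(): U-labels -> A-labels ('xn--' + Punycode), with IDNA validation
alabels = idna.encode('ドメイン.テスト')   # b'xn--eckwd4c7c.xn--zckzah'

# decode(): A-labels back to the Unicode form
print(idna.decode(alabels))                # ドメイン.テスト
```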
2151
_vendor/idna/idnadata.py
Normal file
2151
_vendor/idna/idnadata.py
Normal file
File diff suppressed because it is too large
54
_vendor/idna/intranges.py
Normal file
54
_vendor/idna/intranges.py
Normal file
@@ -0,0 +1,54 @@
"""
Given a list of integers, made up of (hopefully) a small number of long runs
of consecutive integers, compute a representation of the form
((start1, end1), (start2, end2) ...). Then answer the question "was x present
in the original list?" in time O(log(# runs)).
"""

import bisect
from typing import List, Tuple


def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
    """Represent a list of integers as a sequence of ranges:
    ((start_0, end_0), (start_1, end_1), ...), such that the original
    integers are exactly those x such that start_i <= x < end_i for some i.

    Ranges are encoded as single integers (start << 32 | end), not as tuples.
    """

    sorted_list = sorted(list_)
    ranges = []
    last_write = -1
    for i in range(len(sorted_list)):
        if i+1 < len(sorted_list):
            if sorted_list[i] == sorted_list[i+1]-1:
                continue
        current_range = sorted_list[last_write+1:i+1]
        ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
        last_write = i

    return tuple(ranges)

def _encode_range(start: int, end: int) -> int:
    return (start << 32) | end

def _decode_range(r: int) -> Tuple[int, int]:
    return (r >> 32), (r & ((1 << 32) - 1))


def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
    """Determine if `int_` falls into one of the ranges in `ranges`."""
    tuple_ = _encode_range(int_, 0)
    pos = bisect.bisect_left(ranges, tuple_)
    # we could be immediately ahead of a tuple (start, end)
    # with start < int_ <= end
    if pos > 0:
        left, right = _decode_range(ranges[pos-1])
        if left <= int_ < right:
            return True
    # or we could be immediately behind a tuple (int_, end)
    if pos < len(ranges):
        left, _ = _decode_range(ranges[pos])
        if left == int_:
            return True
    return False
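The module above packs each run of consecutive codepoints into a single integer (start << 32 | end) and answers membership queries with a binary search over the sorted tuple. A short sketch of how it behaves, for illustration only (not part of the vendored file; it assumes the package is importable as `idna`):

```python
# Illustration only - not part of the vendored module.
from idna.intranges import intranges_from_list, intranges_contain

codepoints = [0x61, 0x62, 0x63, 0x7A]      # 'a', 'b', 'c', 'z'
ranges = intranges_from_list(codepoints)   # ((0x61 << 32) | 0x64, (0x7A << 32) | 0x7B)

assert intranges_contain(0x62, ranges)         # 'b' lies inside the 0x61..0x63 run
assert not intranges_contain(0x64, ranges)     # 'd' was never in the list
```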
2
_vendor/idna/package_data.py
Normal file
2
_vendor/idna/package_data.py
Normal file
@@ -0,0 +1,2 @@
__version__ = '3.4'

0
_vendor/idna/py.typed
Normal file
0
_vendor/idna/py.typed
Normal file
8600
_vendor/idna/uts46data.py
Normal file
8600
_vendor/idna/uts46data.py
Normal file
File diff suppressed because it is too large
1
_vendor/requests-2.28.1.dist-info/INSTALLER
Normal file
1
_vendor/requests-2.28.1.dist-info/INSTALLER
Normal file
@@ -0,0 +1 @@
pip
175
_vendor/requests-2.28.1.dist-info/LICENSE
Normal file
175
_vendor/requests-2.28.1.dist-info/LICENSE
Normal file
@@ -0,0 +1,175 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
122
_vendor/requests-2.28.1.dist-info/METADATA
Normal file
122
_vendor/requests-2.28.1.dist-info/METADATA
Normal file
@@ -0,0 +1,122 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: requests
|
||||
Version: 2.28.1
|
||||
Summary: Python HTTP for Humans.
|
||||
Home-page: https://requests.readthedocs.io
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.org
|
||||
License: Apache 2.0
|
||||
Project-URL: Documentation, https://requests.readthedocs.io
|
||||
Project-URL: Source, https://github.com/psf/requests
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Environment :: Web Environment
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: WWW/HTTP
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Requires-Python: >=3.7, <4
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: charset-normalizer (<3,>=2)
|
||||
Requires-Dist: idna (<4,>=2.5)
|
||||
Requires-Dist: urllib3 (<1.27,>=1.21.1)
|
||||
Requires-Dist: certifi (>=2017.4.17)
|
||||
Provides-Extra: security
|
||||
Provides-Extra: socks
|
||||
Requires-Dist: PySocks (!=1.5.7,>=1.5.6) ; extra == 'socks'
|
||||
Provides-Extra: use_chardet_on_py3
|
||||
Requires-Dist: chardet (<6,>=3.0.2) ; extra == 'use_chardet_on_py3'
|
||||
|
||||
# Requests
|
||||
|
||||
**Requests** is a simple, yet elegant, HTTP library.
|
||||
|
||||
```python
|
||||
>>> import requests
|
||||
>>> r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=('user', 'pass'))
|
||||
>>> r.status_code
|
||||
200
|
||||
>>> r.headers['content-type']
|
||||
'application/json; charset=utf8'
|
||||
>>> r.encoding
|
||||
'utf-8'
|
||||
>>> r.text
|
||||
'{"authenticated": true, ...'
|
||||
>>> r.json()
|
||||
{'authenticated': True, ...}
|
||||
```
|
||||
|
||||
Requests allows you to send HTTP/1.1 requests extremely easily. There’s no need to manually add query strings to your URLs, or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method!
|
||||
|
||||
Requests is one of the most downloaded Python packages today, pulling in around `30M downloads / week`— according to GitHub, Requests is currently [depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) by `1,000,000+` repositories. You may certainly put your trust in this code.
|
||||
|
||||
[](https://pepy.tech/project/requests)
|
||||
[](https://pypi.org/project/requests)
|
||||
[](https://github.com/psf/requests/graphs/contributors)
|
||||
|
||||
## Installing Requests and Supported Versions
|
||||
|
||||
Requests is available on PyPI:
|
||||
|
||||
```console
|
||||
$ python -m pip install requests
|
||||
```
|
||||
|
||||
Requests officially supports Python 3.7+.
|
||||
|
||||
## Supported Features & Best–Practices
|
||||
|
||||
Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today.
|
||||
|
||||
- Keep-Alive & Connection Pooling
|
||||
- International Domains and URLs
|
||||
- Sessions with Cookie Persistence
|
||||
- Browser-style TLS/SSL Verification
|
||||
- Basic & Digest Authentication
|
||||
- Familiar `dict`–like Cookies
|
||||
- Automatic Content Decompression and Decoding
|
||||
- Multi-part File Uploads
|
||||
- SOCKS Proxy Support
|
||||
- Connection Timeouts
|
||||
- Streaming Downloads
|
||||
- Automatic honoring of `.netrc`
|
||||
- Chunked HTTP Requests
|
||||
|
||||
## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io)
|
||||
|
||||
[](https://requests.readthedocs.io)
|
||||
|
||||
## Cloning the repository
|
||||
|
||||
When cloning the Requests repository, you may need to add the `-c
|
||||
fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit (see
|
||||
[this issue](https://github.com/psf/requests/issues/2690) for more background):
|
||||
|
||||
```shell
|
||||
git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git
|
||||
```
|
||||
|
||||
You can also apply this setting to your global Git config:
|
||||
|
||||
```shell
|
||||
git config --global fetch.fsck.badTimezone ignore
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
[](https://kennethreitz.org) [](https://www.python.org/psf)
|
||||
|
||||
|
||||
43
_vendor/requests-2.28.1.dist-info/RECORD
Normal file
43
_vendor/requests-2.28.1.dist-info/RECORD
Normal file
@@ -0,0 +1,43 @@
|
||||
requests-2.28.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
requests-2.28.1.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
|
||||
requests-2.28.1.dist-info/METADATA,sha256=eoNYSJuPWbql7Til9dKPb--KRU2ouGR4g7UxtZFoHNU,4641
|
||||
requests-2.28.1.dist-info/RECORD,,
|
||||
requests-2.28.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
requests-2.28.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
requests-2.28.1.dist-info/top_level.txt,sha256=fMSVmHfb5rbGOo6xv-O_tUX6j-WyixssE-SnwcDRxNQ,9
|
||||
requests/__init__.py,sha256=S2K0jnVP6CSrT51SctFyiB0XfI8H9Nt7EqzERAD44gg,4972
|
||||
requests/__pycache__/__init__.cpython-310.pyc,,
|
||||
requests/__pycache__/__version__.cpython-310.pyc,,
|
||||
requests/__pycache__/_internal_utils.cpython-310.pyc,,
|
||||
requests/__pycache__/adapters.cpython-310.pyc,,
|
||||
requests/__pycache__/api.cpython-310.pyc,,
|
||||
requests/__pycache__/auth.cpython-310.pyc,,
|
||||
requests/__pycache__/certs.cpython-310.pyc,,
|
||||
requests/__pycache__/compat.cpython-310.pyc,,
|
||||
requests/__pycache__/cookies.cpython-310.pyc,,
|
||||
requests/__pycache__/exceptions.cpython-310.pyc,,
|
||||
requests/__pycache__/help.cpython-310.pyc,,
|
||||
requests/__pycache__/hooks.cpython-310.pyc,,
|
||||
requests/__pycache__/models.cpython-310.pyc,,
|
||||
requests/__pycache__/packages.cpython-310.pyc,,
|
||||
requests/__pycache__/sessions.cpython-310.pyc,,
|
||||
requests/__pycache__/status_codes.cpython-310.pyc,,
|
||||
requests/__pycache__/structures.cpython-310.pyc,,
|
||||
requests/__pycache__/utils.cpython-310.pyc,,
|
||||
requests/__version__.py,sha256=nJVa3ef2yRyeYMhy7yHnRyjjpnNTDykZsE4Sp9irBC4,440
|
||||
requests/_internal_utils.py,sha256=aSPlF4uDhtfKxEayZJJ7KkAxtormeTfpwKSBSwtmAUw,1397
|
||||
requests/adapters.py,sha256=sEnHGl4mJz4QHBT8jG6bU5aPinUtdoH3BIuAIzT-X74,21287
|
||||
requests/api.py,sha256=dyvkDd5itC9z2g0wHl_YfD1yf6YwpGWLO7__8e21nks,6377
|
||||
requests/auth.py,sha256=h-HLlVx9j8rKV5hfSAycP2ApOSglTz77R0tz7qCbbEE,10187
|
||||
requests/certs.py,sha256=Z9Sb410Anv6jUFTyss0jFFhU6xst8ctELqfy8Ev23gw,429
|
||||
requests/compat.py,sha256=yxntVOSEHGMrn7FNr_32EEam1ZNAdPRdSE13_yaHzTk,1451
|
||||
requests/cookies.py,sha256=kD3kNEcCj-mxbtf5fJsSaT86eGoEYpD3X0CSgpzl7BM,18560
|
||||
requests/exceptions.py,sha256=DhveFBclVjTRxhRduVpO-GbMYMID2gmjdLfNEqNpI_U,3811
|
||||
requests/help.py,sha256=gPX5d_H7Xd88aDABejhqGgl9B1VFRTt5BmiYvL3PzIQ,3875
|
||||
requests/hooks.py,sha256=CiuysiHA39V5UfcCBXFIx83IrDpuwfN9RcTUgv28ftQ,733
|
||||
requests/models.py,sha256=OiVxiOdlhzpbZoxut2OhKtpYlB7WW4iHQcfqSVmT4H4,35222
|
||||
requests/packages.py,sha256=DXgv-FJIczZITmv0vEBAhWj4W-5CGCIN_ksvgR17Dvs,957
|
||||
requests/sessions.py,sha256=KUqJcRRLovNefUs7ScOXSUVCcfSayTFWtbiJ7gOSlTI,30180
|
||||
requests/status_codes.py,sha256=FvHmT5uH-_uimtRz5hH9VCbt7VV-Nei2J9upbej6j8g,4235
|
||||
requests/structures.py,sha256=-IbmhVz06S-5aPSZuUthZ6-6D9XOjRuTXHOabY041XM,2912
|
||||
requests/utils.py,sha256=5_ws-bsKI9EHl7j27yi-6HFzPBKultPgd7HfPrUToWI,33228
|
||||
0
_vendor/requests-2.28.1.dist-info/REQUESTED
Normal file
0
_vendor/requests-2.28.1.dist-info/REQUESTED
Normal file
5
_vendor/requests-2.28.1.dist-info/WHEEL
Normal file
5
_vendor/requests-2.28.1.dist-info/WHEEL
Normal file
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

1
_vendor/requests-2.28.1.dist-info/top_level.txt
Normal file
1
_vendor/requests-2.28.1.dist-info/top_level.txt
Normal file
@@ -0,0 +1 @@
requests
180
_vendor/requests/__init__.py
Normal file
180
_vendor/requests/__init__.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# __
|
||||
# /__) _ _ _ _ _/ _
|
||||
# / ( (- (/ (/ (- _) / _)
|
||||
# /
|
||||
|
||||
"""
|
||||
Requests HTTP Library
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Requests is an HTTP library, written in Python, for human beings.
|
||||
Basic GET usage:
|
||||
|
||||
>>> import requests
|
||||
>>> r = requests.get('https://www.python.org')
|
||||
>>> r.status_code
|
||||
200
|
||||
>>> b'Python is a programming language' in r.content
|
||||
True
|
||||
|
||||
... or POST:
|
||||
|
||||
>>> payload = dict(key1='value1', key2='value2')
|
||||
>>> r = requests.post('https://httpbin.org/post', data=payload)
|
||||
>>> print(r.text)
|
||||
{
|
||||
...
|
||||
"form": {
|
||||
"key1": "value1",
|
||||
"key2": "value2"
|
||||
},
|
||||
...
|
||||
}
|
||||
|
||||
The other HTTP methods are supported - see `requests.api`. Full documentation
|
||||
is at <https://requests.readthedocs.io>.
|
||||
|
||||
:copyright: (c) 2017 by Kenneth Reitz.
|
||||
:license: Apache 2.0, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
import urllib3
|
||||
|
||||
from .exceptions import RequestsDependencyWarning
|
||||
|
||||
try:
|
||||
from charset_normalizer import __version__ as charset_normalizer_version
|
||||
except ImportError:
|
||||
charset_normalizer_version = None
|
||||
|
||||
try:
|
||||
from chardet import __version__ as chardet_version
|
||||
except ImportError:
|
||||
chardet_version = None
|
||||
|
||||
|
||||
def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version):
|
||||
urllib3_version = urllib3_version.split(".")
|
||||
assert urllib3_version != ["dev"] # Verify urllib3 isn't installed from git.
|
||||
|
||||
# Sometimes, urllib3 only reports its version as 16.1.
|
||||
if len(urllib3_version) == 2:
|
||||
urllib3_version.append("0")
|
||||
|
||||
# Check urllib3 for compatibility.
|
||||
major, minor, patch = urllib3_version # noqa: F811
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# urllib3 >= 1.21.1, <= 1.26
|
||||
assert major == 1
|
||||
assert minor >= 21
|
||||
assert minor <= 26
|
||||
|
||||
# Check charset_normalizer for compatibility.
|
||||
if chardet_version:
|
||||
major, minor, patch = chardet_version.split(".")[:3]
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# chardet_version >= 3.0.2, < 6.0.0
|
||||
assert (3, 0, 2) <= (major, minor, patch) < (6, 0, 0)
|
||||
elif charset_normalizer_version:
|
||||
major, minor, patch = charset_normalizer_version.split(".")[:3]
|
||||
major, minor, patch = int(major), int(minor), int(patch)
|
||||
# charset_normalizer >= 2.0.0 < 3.0.0
|
||||
assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0)
|
||||
else:
|
||||
raise Exception("You need either charset_normalizer or chardet installed")
|
||||
|
||||
|
||||
def _check_cryptography(cryptography_version):
|
||||
# cryptography < 1.3.4
|
||||
try:
|
||||
cryptography_version = list(map(int, cryptography_version.split(".")))
|
||||
except ValueError:
|
||||
return
|
||||
|
||||
if cryptography_version < [1, 3, 4]:
|
||||
warning = "Old version of cryptography ({}) may cause slowdown.".format(
|
||||
cryptography_version
|
||||
)
|
||||
warnings.warn(warning, RequestsDependencyWarning)
|
||||
|
||||
|
||||
# Check imported dependencies for compatibility.
|
||||
try:
|
||||
check_compatibility(
|
||||
urllib3.__version__, chardet_version, charset_normalizer_version
|
||||
)
|
||||
except (AssertionError, ValueError):
|
||||
warnings.warn(
|
||||
"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
|
||||
"version!".format(
|
||||
urllib3.__version__, chardet_version, charset_normalizer_version
|
||||
),
|
||||
RequestsDependencyWarning,
|
||||
)
|
||||
|
||||
# Attempt to enable urllib3's fallback for SNI support
|
||||
# if the standard library doesn't support SNI or the
|
||||
# 'ssl' library isn't available.
|
||||
try:
|
||||
try:
|
||||
import ssl
|
||||
except ImportError:
|
||||
ssl = None
|
||||
|
||||
if not getattr(ssl, "HAS_SNI", False):
|
||||
from urllib3.contrib import pyopenssl
|
||||
|
||||
pyopenssl.inject_into_urllib3()
|
||||
|
||||
# Check cryptography version
|
||||
from cryptography import __version__ as cryptography_version
|
||||
|
||||
_check_cryptography(cryptography_version)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# urllib3's DependencyWarnings should be silenced.
|
||||
from urllib3.exceptions import DependencyWarning
|
||||
|
||||
warnings.simplefilter("ignore", DependencyWarning)
|
||||
|
||||
# Set default logging handler to avoid "No handler found" warnings.
|
||||
import logging
|
||||
from logging import NullHandler
|
||||
|
||||
from . import packages, utils
|
||||
from .__version__ import (
|
||||
__author__,
|
||||
__author_email__,
|
||||
__build__,
|
||||
__cake__,
|
||||
__copyright__,
|
||||
__description__,
|
||||
__license__,
|
||||
__title__,
|
||||
__url__,
|
||||
__version__,
|
||||
)
|
||||
from .api import delete, get, head, options, patch, post, put, request
|
||||
from .exceptions import (
|
||||
ConnectionError,
|
||||
ConnectTimeout,
|
||||
FileModeWarning,
|
||||
HTTPError,
|
||||
JSONDecodeError,
|
||||
ReadTimeout,
|
||||
RequestException,
|
||||
Timeout,
|
||||
TooManyRedirects,
|
||||
URLRequired,
|
||||
)
|
||||
from .models import PreparedRequest, Request, Response
|
||||
from .sessions import Session, session
|
||||
from .status_codes import codes
|
||||
|
||||
logging.getLogger(__name__).addHandler(NullHandler())
|
||||
|
||||
# FileModeWarnings go off per the default.
|
||||
warnings.simplefilter("default", FileModeWarning, append=True)
|
||||
BIN  _vendor/requests/__pycache__/__init__.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/__version__.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/_internal_utils.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/adapters.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/api.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/auth.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/certs.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/compat.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/cookies.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/exceptions.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/help.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/hooks.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/models.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/packages.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/sessions.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/status_codes.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/structures.cpython-310.pyc  Normal file  (Binary file not shown.)
BIN  _vendor/requests/__pycache__/utils.cpython-310.pyc  Normal file  (Binary file not shown.)
14
_vendor/requests/__version__.py
Normal file
14
_vendor/requests/__version__.py
Normal file
@@ -0,0 +1,14 @@
# .-. .-. .-. . . .-. .-. .-. .-.
# |(  |-  |.| | | |-  `-.  |  `-.
# ' ' `-' `-`.`-' `-' `-'  '  `-'

__title__ = "requests"
__description__ = "Python HTTP for Humans."
__url__ = "https://requests.readthedocs.io"
__version__ = "2.28.1"
__build__ = 0x022801
__author__ = "Kenneth Reitz"
__author_email__ = "me@kennethreitz.org"
__license__ = "Apache 2.0"
__copyright__ = "Copyright 2022 Kenneth Reitz"
__cake__ = "\u2728 \U0001f370 \u2728"
48
_vendor/requests/_internal_utils.py
Normal file
48
_vendor/requests/_internal_utils.py
Normal file
@@ -0,0 +1,48 @@
"""
requests._internal_utils
~~~~~~~~~~~~~~

Provides utility functions that are consumed internally by Requests
which depend on extremely few external helpers (such as compat)
"""
import re

from .compat import builtin_str

_VALID_HEADER_NAME_RE_BYTE = re.compile(rb"^[^:\s][^:\r\n]*$")
_VALID_HEADER_NAME_RE_STR = re.compile(r"^[^:\s][^:\r\n]*$")
_VALID_HEADER_VALUE_RE_BYTE = re.compile(rb"^\S[^\r\n]*$|^$")
_VALID_HEADER_VALUE_RE_STR = re.compile(r"^\S[^\r\n]*$|^$")

HEADER_VALIDATORS = {
    bytes: (_VALID_HEADER_NAME_RE_BYTE, _VALID_HEADER_VALUE_RE_BYTE),
    str: (_VALID_HEADER_NAME_RE_STR, _VALID_HEADER_VALUE_RE_STR),
}


def to_native_string(string, encoding="ascii"):
    """Given a string object, regardless of type, returns a representation of
    that string in the native string type, encoding and decoding where
    necessary. This assumes ASCII unless told otherwise.
    """
    if isinstance(string, builtin_str):
        out = string
    else:
        out = string.decode(encoding)

    return out


def unicode_is_ascii(u_string):
    """Determine if unicode string only contains ASCII characters.

    :param str u_string: unicode string to check. Must be unicode
        and not Python 2 `str`.
    :rtype: bool
    """
    assert isinstance(u_string, str)
    try:
        u_string.encode("ascii")
        return True
    except UnicodeEncodeError:
        return False
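The two helpers above normalize header material: to_native_string() decodes bytes into the native str type, and unicode_is_ascii() reports whether a str is pure ASCII. A brief behavioural sketch, for illustration only (not part of the vendored file):

```python
# Illustration only - not part of the vendored file.
from requests._internal_utils import to_native_string, unicode_is_ascii

assert to_native_string(b"text/plain") == "text/plain"  # bytes are decoded to str
assert unicode_is_ascii("hello") is True
assert unicode_is_ascii("héllo") is False
```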
584
_vendor/requests/adapters.py
Normal file
584
_vendor/requests/adapters.py
Normal file
@@ -0,0 +1,584 @@
|
||||
"""
|
||||
requests.adapters
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains the transport adapters that Requests uses to define
|
||||
and maintain connections.
|
||||
"""
|
||||
|
||||
import os.path
|
||||
import socket # noqa: F401
|
||||
|
||||
from urllib3.exceptions import ClosedPoolError, ConnectTimeoutError
|
||||
from urllib3.exceptions import HTTPError as _HTTPError
|
||||
from urllib3.exceptions import InvalidHeader as _InvalidHeader
|
||||
from urllib3.exceptions import (
|
||||
LocationValueError,
|
||||
MaxRetryError,
|
||||
NewConnectionError,
|
||||
ProtocolError,
|
||||
)
|
||||
from urllib3.exceptions import ProxyError as _ProxyError
|
||||
from urllib3.exceptions import ReadTimeoutError, ResponseError
|
||||
from urllib3.exceptions import SSLError as _SSLError
|
||||
from urllib3.poolmanager import PoolManager, proxy_from_url
|
||||
from urllib3.response import HTTPResponse
|
||||
from urllib3.util import Timeout as TimeoutSauce
|
||||
from urllib3.util import parse_url
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from .auth import _basic_auth_str
|
||||
from .compat import basestring, urlparse
|
||||
from .cookies import extract_cookies_to_jar
|
||||
from .exceptions import (
|
||||
ConnectionError,
|
||||
ConnectTimeout,
|
||||
InvalidHeader,
|
||||
InvalidProxyURL,
|
||||
InvalidSchema,
|
||||
InvalidURL,
|
||||
ProxyError,
|
||||
ReadTimeout,
|
||||
RetryError,
|
||||
SSLError,
|
||||
)
|
||||
from .models import Response
|
||||
from .structures import CaseInsensitiveDict
|
||||
from .utils import (
|
||||
DEFAULT_CA_BUNDLE_PATH,
|
||||
extract_zipped_paths,
|
||||
get_auth_from_url,
|
||||
get_encoding_from_headers,
|
||||
prepend_scheme_if_needed,
|
||||
select_proxy,
|
||||
urldefragauth,
|
||||
)
|
||||
|
||||
try:
|
||||
from urllib3.contrib.socks import SOCKSProxyManager
|
||||
except ImportError:
|
||||
|
||||
def SOCKSProxyManager(*args, **kwargs):
|
||||
raise InvalidSchema("Missing dependencies for SOCKS support.")
|
||||
|
||||
|
||||
DEFAULT_POOLBLOCK = False
|
||||
DEFAULT_POOLSIZE = 10
|
||||
DEFAULT_RETRIES = 0
|
||||
DEFAULT_POOL_TIMEOUT = None
|
||||
|
||||
|
||||
class BaseAdapter:
|
||||
"""The Base Transport Adapter"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def send(
|
||||
self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None
|
||||
):
|
||||
"""Sends PreparedRequest object. Returns Response object.
|
||||
|
||||
:param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
|
||||
:param stream: (optional) Whether to stream the request content.
|
||||
:param timeout: (optional) How long to wait for the server to send
|
||||
data before giving up, as a float, or a :ref:`(connect timeout,
|
||||
read timeout) <timeouts>` tuple.
|
||||
:type timeout: float or tuple
|
||||
:param verify: (optional) Either a boolean, in which case it controls whether we verify
|
||||
the server's TLS certificate, or a string, in which case it must be a path
|
||||
to a CA bundle to use
|
||||
:param cert: (optional) Any user-provided SSL certificate to be trusted.
|
||||
:param proxies: (optional) The proxies dictionary to apply to the request.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def close(self):
|
||||
"""Cleans up adapter specific items."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class HTTPAdapter(BaseAdapter):
|
||||
"""The built-in HTTP Adapter for urllib3.
|
||||
|
||||
Provides a general-case interface for Requests sessions to contact HTTP and
|
||||
HTTPS urls by implementing the Transport Adapter interface. This class will
|
||||
usually be created by the :class:`Session <Session>` class under the
|
||||
covers.
|
||||
|
||||
:param pool_connections: The number of urllib3 connection pools to cache.
|
||||
:param pool_maxsize: The maximum number of connections to save in the pool.
|
||||
:param max_retries: The maximum number of retries each connection
|
||||
should attempt. Note, this applies only to failed DNS lookups, socket
|
||||
connections and connection timeouts, never to requests where data has
|
||||
made it to the server. By default, Requests does not retry failed
|
||||
connections. If you need granular control over the conditions under
|
||||
which we retry a request, import urllib3's ``Retry`` class and pass
|
||||
that instead.
|
||||
:param pool_block: Whether the connection pool should block for connections.
|
||||
|
||||
Usage::
|
||||
|
||||
>>> import requests
|
||||
>>> s = requests.Session()
|
||||
>>> a = requests.adapters.HTTPAdapter(max_retries=3)
|
||||
>>> s.mount('http://', a)
|
||||
"""
|
||||
|
||||
__attrs__ = [
|
||||
"max_retries",
|
||||
"config",
|
||||
"_pool_connections",
|
||||
"_pool_maxsize",
|
||||
"_pool_block",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pool_connections=DEFAULT_POOLSIZE,
|
||||
pool_maxsize=DEFAULT_POOLSIZE,
|
||||
max_retries=DEFAULT_RETRIES,
|
||||
pool_block=DEFAULT_POOLBLOCK,
|
||||
):
|
||||
if max_retries == DEFAULT_RETRIES:
|
||||
self.max_retries = Retry(0, read=False)
|
||||
else:
|
||||
self.max_retries = Retry.from_int(max_retries)
|
||||
self.config = {}
|
||||
self.proxy_manager = {}
|
||||
|
||||
super().__init__()
|
||||
|
||||
self._pool_connections = pool_connections
|
||||
self._pool_maxsize = pool_maxsize
|
||||
self._pool_block = pool_block
|
||||
|
||||
self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block)
|
||||
|
||||
def __getstate__(self):
|
||||
return {attr: getattr(self, attr, None) for attr in self.__attrs__}
|
||||
|
||||
def __setstate__(self, state):
|
||||
# Can't handle by adding 'proxy_manager' to self.__attrs__ because
|
||||
# self.poolmanager uses a lambda function, which isn't pickleable.
|
||||
self.proxy_manager = {}
|
||||
self.config = {}
|
||||
|
||||
for attr, value in state.items():
|
||||
setattr(self, attr, value)
|
||||
|
||||
self.init_poolmanager(
|
||||
self._pool_connections, self._pool_maxsize, block=self._pool_block
|
||||
)
|
||||
|
||||
def init_poolmanager(
|
||||
self, connections, maxsize, block=DEFAULT_POOLBLOCK, **pool_kwargs
|
||||
):
|
||||
"""Initializes a urllib3 PoolManager.
|
||||
|
||||
This method should not be called from user code, and is only
|
||||
exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param connections: The number of urllib3 connection pools to cache.
|
||||
:param maxsize: The maximum number of connections to save in the pool.
|
||||
:param block: Block when no free connections are available.
|
||||
:param pool_kwargs: Extra keyword arguments used to initialize the Pool Manager.
|
||||
"""
|
||||
# save these values for pickling
|
||||
self._pool_connections = connections
|
||||
self._pool_maxsize = maxsize
|
||||
self._pool_block = block
|
||||
|
||||
self.poolmanager = PoolManager(
|
||||
num_pools=connections,
|
||||
maxsize=maxsize,
|
||||
block=block,
|
||||
strict=True,
|
||||
**pool_kwargs,
|
||||
)
|
||||
|
||||
def proxy_manager_for(self, proxy, **proxy_kwargs):
|
||||
"""Return urllib3 ProxyManager for the given proxy.
|
||||
|
||||
This method should not be called from user code, and is only
|
||||
exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param proxy: The proxy to return a urllib3 ProxyManager for.
|
||||
:param proxy_kwargs: Extra keyword arguments used to configure the Proxy Manager.
|
||||
:returns: ProxyManager
|
||||
:rtype: urllib3.ProxyManager
|
||||
"""
|
||||
if proxy in self.proxy_manager:
|
||||
manager = self.proxy_manager[proxy]
|
||||
elif proxy.lower().startswith("socks"):
|
||||
username, password = get_auth_from_url(proxy)
|
||||
manager = self.proxy_manager[proxy] = SOCKSProxyManager(
|
||||
proxy,
|
||||
username=username,
|
||||
password=password,
|
||||
num_pools=self._pool_connections,
|
||||
maxsize=self._pool_maxsize,
|
||||
block=self._pool_block,
|
||||
**proxy_kwargs,
|
||||
)
|
||||
else:
|
||||
proxy_headers = self.proxy_headers(proxy)
|
||||
manager = self.proxy_manager[proxy] = proxy_from_url(
|
||||
proxy,
|
||||
proxy_headers=proxy_headers,
|
||||
num_pools=self._pool_connections,
|
||||
maxsize=self._pool_maxsize,
|
||||
block=self._pool_block,
|
||||
**proxy_kwargs,
|
||||
)
|
||||
|
||||
return manager
|
||||
|
||||
def cert_verify(self, conn, url, verify, cert):
|
||||
"""Verify a SSL certificate. This method should not be called from user
|
||||
code, and is only exposed for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.
|
||||
|
||||
:param conn: The urllib3 connection object associated with the cert.
|
||||
:param url: The requested URL.
|
||||
:param verify: Either a boolean, in which case it controls whether we verify
|
||||
the server's TLS certificate, or a string, in which case it must be a path
|
||||
to a CA bundle to use
|
||||
:param cert: The SSL certificate to verify.
|
||||
"""
|
||||
if url.lower().startswith("https") and verify:
|
||||
|
||||
cert_loc = None
|
||||
|
||||
# Allow self-specified cert location.
|
||||
if verify is not True:
|
||||
cert_loc = verify
|
||||
|
||||
if not cert_loc:
|
||||
cert_loc = extract_zipped_paths(DEFAULT_CA_BUNDLE_PATH)
|
||||
|
||||
if not cert_loc or not os.path.exists(cert_loc):
|
||||
raise OSError(
|
||||
f"Could not find a suitable TLS CA certificate bundle, "
|
||||
f"invalid path: {cert_loc}"
|
||||
)
|
||||
|
||||
conn.cert_reqs = "CERT_REQUIRED"
|
||||
|
||||
if not os.path.isdir(cert_loc):
|
||||
conn.ca_certs = cert_loc
|
||||
else:
|
||||
conn.ca_cert_dir = cert_loc
|
||||
else:
|
||||
conn.cert_reqs = "CERT_NONE"
|
||||
conn.ca_certs = None
|
||||
conn.ca_cert_dir = None
|
||||
|
||||
if cert:
|
||||
if not isinstance(cert, basestring):
|
||||
conn.cert_file = cert[0]
|
||||
conn.key_file = cert[1]
|
||||
else:
|
||||
conn.cert_file = cert
|
||||
conn.key_file = None
|
||||
if conn.cert_file and not os.path.exists(conn.cert_file):
|
||||
raise OSError(
|
||||
f"Could not find the TLS certificate file, "
|
||||
f"invalid path: {conn.cert_file}"
|
||||
)
|
||||
if conn.key_file and not os.path.exists(conn.key_file):
|
||||
raise OSError(
|
||||
f"Could not find the TLS key file, invalid path: {conn.key_file}"
|
||||
)
|
||||
|
||||
def build_response(self, req, resp):
|
||||
"""Builds a :class:`Response <requests.Response>` object from a urllib3
|
||||
response. This should not be called from user code, and is only exposed
|
||||
for use when subclassing the
|
||||
:class:`HTTPAdapter <requests.adapters.HTTPAdapter>`
|
||||
|
||||
:param req: The :class:`PreparedRequest <PreparedRequest>` used to generate the response.
|
||||
:param resp: The urllib3 response object.
|
||||
:rtype: requests.Response
|
||||
"""
|
||||
response = Response()
|
||||
|
||||
# Fallback to None if there's no status_code, for whatever reason.
|
||||
response.status_code = getattr(resp, "status", None)
|
||||
|
||||
# Make headers case-insensitive.
|
||||
response.headers = CaseInsensitiveDict(getattr(resp, "headers", {}))
|
||||
|
||||
# Set encoding.
|
||||
response.encoding = get_encoding_from_headers(response.headers)
|
||||
response.raw = resp
|
||||
response.reason = response.raw.reason
|
||||
|
||||
if isinstance(req.url, bytes):
|
||||
response.url = req.url.decode("utf-8")
|
||||
else:
|
||||
response.url = req.url
|
||||
|
||||
# Add new cookies from the server.
|
||||
extract_cookies_to_jar(response.cookies, req, resp)
|
||||
|
||||
# Give the Response some context.
|
||||
        response.request = req
        response.connection = self

        return response

    def get_connection(self, url, proxies=None):
        """Returns a urllib3 connection for the given URL. This should not be
        called from user code, and is only exposed for use when subclassing the
        :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.

        :param url: The URL to connect to.
        :param proxies: (optional) A Requests-style dictionary of proxies used on this request.
        :rtype: urllib3.ConnectionPool
        """
        proxy = select_proxy(url, proxies)

        if proxy:
            proxy = prepend_scheme_if_needed(proxy, "http")
            proxy_url = parse_url(proxy)
            if not proxy_url.host:
                raise InvalidProxyURL(
                    "Please check proxy URL. It is malformed "
                    "and could be missing the host."
                )
            proxy_manager = self.proxy_manager_for(proxy)
            conn = proxy_manager.connection_from_url(url)
        else:
            # Only scheme should be lower case
            parsed = urlparse(url)
            url = parsed.geturl()
            conn = self.poolmanager.connection_from_url(url)

        return conn

    def close(self):
        """Disposes of any internal state.

        Currently, this closes the PoolManager and any active ProxyManager,
        which closes any pooled connections.
        """
        self.poolmanager.clear()
        for proxy in self.proxy_manager.values():
            proxy.clear()

    def request_url(self, request, proxies):
        """Obtain the url to use when making the final request.

        If the message is being sent through a HTTP proxy, the full URL has to
        be used. Otherwise, we should only use the path portion of the URL.

        This should not be called from user code, and is only exposed for use
        when subclassing the
        :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.

        :param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
        :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs.
        :rtype: str
        """
        proxy = select_proxy(request.url, proxies)
        scheme = urlparse(request.url).scheme

        is_proxied_http_request = proxy and scheme != "https"
        using_socks_proxy = False
        if proxy:
            proxy_scheme = urlparse(proxy).scheme.lower()
            using_socks_proxy = proxy_scheme.startswith("socks")

        url = request.path_url
        if is_proxied_http_request and not using_socks_proxy:
            url = urldefragauth(request.url)

        return url

    def add_headers(self, request, **kwargs):
        """Add any headers needed by the connection. As of v2.0 this does
        nothing by default, but is left for overriding by users that subclass
        the :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.

        This should not be called from user code, and is only exposed for use
        when subclassing the
        :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.

        :param request: The :class:`PreparedRequest <PreparedRequest>` to add headers to.
        :param kwargs: The keyword arguments from the call to send().
        """
        pass

    def proxy_headers(self, proxy):
        """Returns a dictionary of the headers to add to any request sent
        through a proxy. This works with urllib3 magic to ensure that they are
        correctly sent to the proxy, rather than in a tunnelled request if
        CONNECT is being used.

        This should not be called from user code, and is only exposed for use
        when subclassing the
        :class:`HTTPAdapter <requests.adapters.HTTPAdapter>`.

        :param proxy: The url of the proxy being used for this request.
        :rtype: dict
        """
        headers = {}
        username, password = get_auth_from_url(proxy)

        if username:
            headers["Proxy-Authorization"] = _basic_auth_str(username, password)

        return headers

    def send(
        self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None
    ):
        """Sends PreparedRequest object. Returns Response object.

        :param request: The :class:`PreparedRequest <PreparedRequest>` being sent.
        :param stream: (optional) Whether to stream the request content.
        :param timeout: (optional) How long to wait for the server to send
            data before giving up, as a float, or a :ref:`(connect timeout,
            read timeout) <timeouts>` tuple.
        :type timeout: float or tuple or urllib3 Timeout object
        :param verify: (optional) Either a boolean, in which case it controls whether
            we verify the server's TLS certificate, or a string, in which case it
            must be a path to a CA bundle to use
        :param cert: (optional) Any user-provided SSL certificate to be trusted.
        :param proxies: (optional) The proxies dictionary to apply to the request.
        :rtype: requests.Response
        """

        try:
            conn = self.get_connection(request.url, proxies)
        except LocationValueError as e:
            raise InvalidURL(e, request=request)

        self.cert_verify(conn, request.url, verify, cert)
        url = self.request_url(request, proxies)
        self.add_headers(
            request,
            stream=stream,
            timeout=timeout,
            verify=verify,
            cert=cert,
            proxies=proxies,
        )

        chunked = not (request.body is None or "Content-Length" in request.headers)

        if isinstance(timeout, tuple):
            try:
                connect, read = timeout
                timeout = TimeoutSauce(connect=connect, read=read)
            except ValueError:
                raise ValueError(
                    f"Invalid timeout {timeout}. Pass a (connect, read) timeout tuple, "
                    f"or a single float to set both timeouts to the same value."
                )
        elif isinstance(timeout, TimeoutSauce):
            pass
        else:
            timeout = TimeoutSauce(connect=timeout, read=timeout)

        try:
            if not chunked:
                resp = conn.urlopen(
                    method=request.method,
                    url=url,
                    body=request.body,
                    headers=request.headers,
                    redirect=False,
                    assert_same_host=False,
                    preload_content=False,
                    decode_content=False,
                    retries=self.max_retries,
                    timeout=timeout,
                )

            # Send the request.
            else:
                if hasattr(conn, "proxy_pool"):
                    conn = conn.proxy_pool

                low_conn = conn._get_conn(timeout=DEFAULT_POOL_TIMEOUT)

                try:
                    skip_host = "Host" in request.headers
                    low_conn.putrequest(
                        request.method,
                        url,
                        skip_accept_encoding=True,
                        skip_host=skip_host,
                    )

                    for header, value in request.headers.items():
                        low_conn.putheader(header, value)

                    low_conn.endheaders()

                    for i in request.body:
                        low_conn.send(hex(len(i))[2:].encode("utf-8"))
                        low_conn.send(b"\r\n")
                        low_conn.send(i)
                        low_conn.send(b"\r\n")
                    low_conn.send(b"0\r\n\r\n")

                    # Receive the response from the server
                    r = low_conn.getresponse()

                    resp = HTTPResponse.from_httplib(
                        r,
                        pool=conn,
                        connection=low_conn,
                        preload_content=False,
                        decode_content=False,
                    )
                except Exception:
                    # If we hit any problems here, clean up the connection.
                    # Then, raise so that we can handle the actual exception.
                    low_conn.close()
                    raise

        except (ProtocolError, OSError) as err:
            raise ConnectionError(err, request=request)

        except MaxRetryError as e:
            if isinstance(e.reason, ConnectTimeoutError):
                # TODO: Remove this in 3.0.0: see #2811
                if not isinstance(e.reason, NewConnectionError):
                    raise ConnectTimeout(e, request=request)

            if isinstance(e.reason, ResponseError):
                raise RetryError(e, request=request)

            if isinstance(e.reason, _ProxyError):
                raise ProxyError(e, request=request)

            if isinstance(e.reason, _SSLError):
                # This branch is for urllib3 v1.22 and later.
                raise SSLError(e, request=request)

            raise ConnectionError(e, request=request)

        except ClosedPoolError as e:
            raise ConnectionError(e, request=request)

        except _ProxyError as e:
            raise ProxyError(e)

        except (_SSLError, _HTTPError) as e:
            if isinstance(e, _SSLError):
                # This branch is for urllib3 versions earlier than v1.22
                raise SSLError(e, request=request)
            elif isinstance(e, ReadTimeoutError):
                raise ReadTimeout(e, request=request)
            elif isinstance(e, _InvalidHeader):
                raise InvalidHeader(e, request=request)
            else:
                raise

        return self.build_response(request, resp)
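For orientation only, not part of the vendored file: the adapter hooks above are normally exercised by subclassing HTTPAdapter and mounting the subclass on a Session. A minimal sketch, with the class name MirrorAdapter and the header value chosen purely for illustration:

import requests
from requests.adapters import HTTPAdapter

class MirrorAdapter(HTTPAdapter):
    def add_headers(self, request, **kwargs):
        # add_headers() is a documented no-op meant for overriding;
        # here it just sets a default header on every outgoing request.
        request.headers.setdefault("User-Agent", "photopea-mirror/1.0")

session = requests.Session()
# max_retries is a standard HTTPAdapter argument; every https:// URL on this
# session is now routed through MirrorAdapter.send().
session.mount("https://", MirrorAdapter(max_retries=3))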
157
_vendor/requests/api.py
Normal file
157
_vendor/requests/api.py
Normal file
@@ -0,0 +1,157 @@
"""
requests.api
~~~~~~~~~~~~

This module implements the Requests API.

:copyright: (c) 2012 by Kenneth Reitz.
:license: Apache2, see LICENSE for more details.
"""

from . import sessions


def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object: ``GET``, ``OPTIONS``, ``HEAD``, ``POST``, ``PUT``, ``PATCH``, or ``DELETE``.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
        or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
        defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
        to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How many seconds to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
    :param verify: (optional) Either a boolean, in which case it controls whether we verify
        the server's TLS certificate, or a string, in which case it must be a path
        to a CA bundle to use. Defaults to ``True``.
    :param stream: (optional) if ``False``, the response content will be immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'https://httpbin.org/get')
      >>> req
      <Response [200]>
    """

    # By using the 'with' statement we are sure the session is closed, thus we
    # avoid leaving sockets open which can trigger a ResourceWarning in some
    # cases, and look like a memory leak in others.
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)


def get(url, params=None, **kwargs):
    r"""Sends a GET request.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("get", url, params=params, **kwargs)


def options(url, **kwargs):
    r"""Sends an OPTIONS request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("options", url, **kwargs)


def head(url, **kwargs):
    r"""Sends a HEAD request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes. If
        `allow_redirects` is not provided, it will be set to `False` (as
        opposed to the default :meth:`request` behavior).
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    kwargs.setdefault("allow_redirects", False)
    return request("head", url, **kwargs)


def post(url, data=None, json=None, **kwargs):
    r"""Sends a POST request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("post", url, data=data, json=json, **kwargs)


def put(url, data=None, **kwargs):
    r"""Sends a PUT request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("put", url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    r"""Sends a PATCH request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("patch", url, data=data, **kwargs)


def delete(url, **kwargs):
    r"""Sends a DELETE request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request("delete", url, **kwargs)
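A quick usage sketch, illustrative only and not part of the vendored file: every helper above is a thin wrapper around request(), which opens a throwaway Session per call. The URLs and parameters below are placeholders.

import requests

# GET with query parameters and a (connect, read) timeout tuple
resp = requests.get("https://httpbin.org/get", params={"q": "photopea"}, timeout=(3.05, 10))
print(resp.status_code)

# POST with a JSON body; json= serializes the dict and sets the Content-Type header
resp = requests.post("https://httpbin.org/post", json={"key": "value"})
print(resp.json())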
315
_vendor/requests/auth.py
Normal file
315
_vendor/requests/auth.py
Normal file
@@ -0,0 +1,315 @@
"""
requests.auth
~~~~~~~~~~~~~

This module contains the authentication handlers for Requests.
"""

import hashlib
import os
import re
import threading
import time
import warnings
from base64 import b64encode

from ._internal_utils import to_native_string
from .compat import basestring, str, urlparse
from .cookies import extract_cookies_to_jar
from .utils import parse_dict_header

CONTENT_TYPE_FORM_URLENCODED = "application/x-www-form-urlencoded"
CONTENT_TYPE_MULTI_PART = "multipart/form-data"


def _basic_auth_str(username, password):
    """Returns a Basic Auth string."""

    # "I want us to put a big-ol' comment on top of it that
    # says that this behaviour is dumb but we need to preserve
    # it because people are relying on it."
    # - Lukasa
    #
    # These are here solely to maintain backwards compatibility
    # for things like ints. This will be removed in 3.0.0.
    if not isinstance(username, basestring):
        warnings.warn(
            "Non-string usernames will no longer be supported in Requests "
            "3.0.0. Please convert the object you've passed in ({!r}) to "
            "a string or bytes object in the near future to avoid "
            "problems.".format(username),
            category=DeprecationWarning,
        )
        username = str(username)

    if not isinstance(password, basestring):
        warnings.warn(
            "Non-string passwords will no longer be supported in Requests "
            "3.0.0. Please convert the object you've passed in ({!r}) to "
            "a string or bytes object in the near future to avoid "
            "problems.".format(type(password)),
            category=DeprecationWarning,
        )
        password = str(password)
    # -- End Removal --

    if isinstance(username, str):
        username = username.encode("latin1")

    if isinstance(password, str):
        password = password.encode("latin1")

    authstr = "Basic " + to_native_string(
        b64encode(b":".join((username, password))).strip()
    )

    return authstr


class AuthBase:
    """Base class that all auth implementations derive from"""

    def __call__(self, r):
        raise NotImplementedError("Auth hooks must be callable.")


class HTTPBasicAuth(AuthBase):
    """Attaches HTTP Basic Authentication to the given Request object."""

    def __init__(self, username, password):
        self.username = username
        self.password = password

    def __eq__(self, other):
        return all(
            [
                self.username == getattr(other, "username", None),
                self.password == getattr(other, "password", None),
            ]
        )

    def __ne__(self, other):
        return not self == other

    def __call__(self, r):
        r.headers["Authorization"] = _basic_auth_str(self.username, self.password)
        return r


class HTTPProxyAuth(HTTPBasicAuth):
    """Attaches HTTP Proxy Authentication to a given Request object."""

    def __call__(self, r):
        r.headers["Proxy-Authorization"] = _basic_auth_str(self.username, self.password)
        return r


class HTTPDigestAuth(AuthBase):
    """Attaches HTTP Digest Authentication to the given Request object."""

    def __init__(self, username, password):
        self.username = username
        self.password = password
        # Keep state in per-thread local storage
        self._thread_local = threading.local()

    def init_per_thread_state(self):
        # Ensure state is initialized just once per-thread
        if not hasattr(self._thread_local, "init"):
            self._thread_local.init = True
            self._thread_local.last_nonce = ""
            self._thread_local.nonce_count = 0
            self._thread_local.chal = {}
            self._thread_local.pos = None
            self._thread_local.num_401_calls = None

    def build_digest_header(self, method, url):
        """
        :rtype: str
        """

        realm = self._thread_local.chal["realm"]
        nonce = self._thread_local.chal["nonce"]
        qop = self._thread_local.chal.get("qop")
        algorithm = self._thread_local.chal.get("algorithm")
        opaque = self._thread_local.chal.get("opaque")
        hash_utf8 = None

        if algorithm is None:
            _algorithm = "MD5"
        else:
            _algorithm = algorithm.upper()
        # lambdas assume digest modules are imported at the top level
        if _algorithm == "MD5" or _algorithm == "MD5-SESS":

            def md5_utf8(x):
                if isinstance(x, str):
                    x = x.encode("utf-8")
                return hashlib.md5(x).hexdigest()

            hash_utf8 = md5_utf8
        elif _algorithm == "SHA":

            def sha_utf8(x):
                if isinstance(x, str):
                    x = x.encode("utf-8")
                return hashlib.sha1(x).hexdigest()

            hash_utf8 = sha_utf8
        elif _algorithm == "SHA-256":

            def sha256_utf8(x):
                if isinstance(x, str):
                    x = x.encode("utf-8")
                return hashlib.sha256(x).hexdigest()

            hash_utf8 = sha256_utf8
        elif _algorithm == "SHA-512":

            def sha512_utf8(x):
                if isinstance(x, str):
                    x = x.encode("utf-8")
                return hashlib.sha512(x).hexdigest()

            hash_utf8 = sha512_utf8

        KD = lambda s, d: hash_utf8(f"{s}:{d}")  # noqa:E731

        if hash_utf8 is None:
            return None

        # XXX not implemented yet
        entdig = None
        p_parsed = urlparse(url)
        #: path is request-uri defined in RFC 2616 which should not be empty
        path = p_parsed.path or "/"
        if p_parsed.query:
            path += f"?{p_parsed.query}"

        A1 = f"{self.username}:{realm}:{self.password}"
        A2 = f"{method}:{path}"

        HA1 = hash_utf8(A1)
        HA2 = hash_utf8(A2)

        if nonce == self._thread_local.last_nonce:
            self._thread_local.nonce_count += 1
        else:
            self._thread_local.nonce_count = 1
        ncvalue = f"{self._thread_local.nonce_count:08x}"
        s = str(self._thread_local.nonce_count).encode("utf-8")
        s += nonce.encode("utf-8")
        s += time.ctime().encode("utf-8")
        s += os.urandom(8)

        cnonce = hashlib.sha1(s).hexdigest()[:16]
        if _algorithm == "MD5-SESS":
            HA1 = hash_utf8(f"{HA1}:{nonce}:{cnonce}")

        if not qop:
            respdig = KD(HA1, f"{nonce}:{HA2}")
        elif qop == "auth" or "auth" in qop.split(","):
            noncebit = f"{nonce}:{ncvalue}:{cnonce}:auth:{HA2}"
            respdig = KD(HA1, noncebit)
        else:
            # XXX handle auth-int.
            return None

        self._thread_local.last_nonce = nonce

        # XXX should the partial digests be encoded too?
        base = (
            f'username="{self.username}", realm="{realm}", nonce="{nonce}", '
            f'uri="{path}", response="{respdig}"'
        )
        if opaque:
            base += f', opaque="{opaque}"'
        if algorithm:
            base += f', algorithm="{algorithm}"'
        if entdig:
            base += f', digest="{entdig}"'
        if qop:
            base += f', qop="auth", nc={ncvalue}, cnonce="{cnonce}"'

        return f"Digest {base}"

    def handle_redirect(self, r, **kwargs):
        """Reset num_401_calls counter on redirects."""
        if r.is_redirect:
            self._thread_local.num_401_calls = 1

    def handle_401(self, r, **kwargs):
        """
        Takes the given response and tries digest-auth, if needed.

        :rtype: requests.Response
        """

        # If response is not 4xx, do not auth
        # See https://github.com/psf/requests/issues/3772
        if not 400 <= r.status_code < 500:
            self._thread_local.num_401_calls = 1
            return r

        if self._thread_local.pos is not None:
            # Rewind the file position indicator of the body to where
            # it was to resend the request.
            r.request.body.seek(self._thread_local.pos)
        s_auth = r.headers.get("www-authenticate", "")

        if "digest" in s_auth.lower() and self._thread_local.num_401_calls < 2:

            self._thread_local.num_401_calls += 1
            pat = re.compile(r"digest ", flags=re.IGNORECASE)
            self._thread_local.chal = parse_dict_header(pat.sub("", s_auth, count=1))

            # Consume content and release the original connection
            # to allow our new request to reuse the same one.
            r.content
            r.close()
            prep = r.request.copy()
            extract_cookies_to_jar(prep._cookies, r.request, r.raw)
            prep.prepare_cookies(prep._cookies)

            prep.headers["Authorization"] = self.build_digest_header(
                prep.method, prep.url
            )
            _r = r.connection.send(prep, **kwargs)
            _r.history.append(r)
            _r.request = prep

            return _r

        self._thread_local.num_401_calls = 1
        return r

    def __call__(self, r):
        # Initialize per-thread state, if needed
        self.init_per_thread_state()
        # If we have a saved nonce, skip the 401
        if self._thread_local.last_nonce:
            r.headers["Authorization"] = self.build_digest_header(r.method, r.url)
        try:
            self._thread_local.pos = r.body.tell()
        except AttributeError:
            # In the case of HTTPDigestAuth being reused and the body of
            # the previous request was a file-like object, pos has the
            # file position of the previous body. Ensure it's set to
            # None.
            self._thread_local.pos = None
        r.register_hook("response", self.handle_401)
        r.register_hook("response", self.handle_redirect)
        self._thread_local.num_401_calls = 1

        return r

    def __eq__(self, other):
        return all(
            [
                self.username == getattr(other, "username", None),
                self.password == getattr(other, "password", None),
            ]
        )

    def __ne__(self, other):
        return not self == other
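A short sketch of how these handlers attach to a request, illustrative only and not part of the vendored file (the URLs and credentials are placeholders): auth objects are callables that receive the PreparedRequest, and HTTPDigestAuth additionally registers the handle_401/handle_redirect hooks defined above.

import requests
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

# Basic auth simply adds an Authorization header up front
requests.get("https://httpbin.org/basic-auth/user/pass", auth=HTTPBasicAuth("user", "pass"))

# Digest auth answers the server's 401 challenge via build_digest_header()
requests.get("https://httpbin.org/digest-auth/auth/user/pass", auth=HTTPDigestAuth("user", "pass"))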
Some files were not shown because too many files have changed in this diff.