added podman, json and yaml
This commit is contained in:
Binary file not shown.
0
containers.yaml
Normal file
0
containers.yaml
Normal file
52
main.py
52
main.py
@ -8,16 +8,27 @@ from pydantic import BaseModel
|
||||
from random import randrange
|
||||
import os
|
||||
import subprocess
|
||||
from podman import PodmanClient
|
||||
import json
|
||||
import yaml
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
uri = "unix:///run/user/1000/podman/podman.sock"
|
||||
podmanapi = PodmanClient(base_url=uri)
|
||||
|
||||
yaml_file = open("containers.yaml", 'r')
|
||||
|
||||
class Post(BaseModel):
|
||||
title: str
|
||||
content: str
|
||||
published: bool = True
|
||||
rating: Optional[int] = None
|
||||
|
||||
|
||||
class Environment(BaseModel):
|
||||
username: str
|
||||
uid: str
|
||||
|
||||
my_posts= [{"title": "title of post 1", "content": "contents of post 1", "id": 1},{"title": "title of post 2", "content": "contents of post 2", "id": 2}]
|
||||
|
||||
def find_post(id):
|
||||
@ -71,15 +82,9 @@ async def update_post(id: int, post: Post):
|
||||
my_posts[index] = post_dict
|
||||
return {"data": post_dict}
|
||||
|
||||
@app.get("/create_env")
|
||||
async def ping_host():
|
||||
# send one packet of data to the host
|
||||
# this is specified by '-c 1' in the argument list
|
||||
outputlist = []
|
||||
# Iterate over all the servers in the list and ping each server
|
||||
# get the output as a string
|
||||
#output = str(os.system(cmd))
|
||||
#cmd = subprocess.run(["/bin/ls", "-al"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
|
||||
|
||||
@app.post("/create_env", status_code=status.HTTP_201_CREATED)
|
||||
async def create_env(env: Environment):
|
||||
cmd = subprocess.run(["cat","list"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
|
||||
# store the output in the list
|
||||
output = (f"{cmd.stdout}")
|
||||
@ -88,12 +93,20 @@ async def ping_host():
|
||||
|
||||
@app.get("/environment/{id}")
|
||||
async def get_env(id: str):
|
||||
#env = find_post(id)
|
||||
cmd = subprocess.run(["cat list | grep %s" % id],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
if cmd.returncode != 0:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Post id {id} not found")
|
||||
else:
|
||||
return {"post_detail": cmd}
|
||||
return {"environment_detail": cmd}
|
||||
|
||||
@app.delete("/environment/{id}")
|
||||
async def get_env(id: str):
|
||||
cmd = subprocess.run(["cat list | grep %s" % id],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
if cmd.returncode != 0:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Post id {id} not found")
|
||||
else:
|
||||
return {"Deleted environment": {id}}
|
||||
|
||||
|
||||
@app.get("/list")
|
||||
async def ping_host():
|
||||
@ -106,9 +119,16 @@ async def ping_host():
|
||||
#cmd = subprocess.run(["/bin/ls", "-al"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
|
||||
cmd = subprocess.run(["cat list"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
#cmd = subprocess.run(["ls /usr/bin"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
# store the output in the list
|
||||
#output = (f"{cmd.stdout}")
|
||||
#output1 = (f'{cmd.stdout.decode("utf-8")}')
|
||||
output2 = (f'{cmd.stdout.decode("utf-8").lower()}')
|
||||
print (output2)
|
||||
return output2
|
||||
#for image in podmanapi.images.list():
|
||||
# print(image, image.id, "\n")
|
||||
#output = podmanapi.images.list()
|
||||
#json_str = json.dumps(podmanapi.df(), indent=4)
|
||||
image = podmanapi.images.pull("docker.io/nginx", tag="latest")
|
||||
#container = podmanapi.containers.create(image="docker.io/library/nginx:latest",name='test',detach=True,publish_all_ports=True)
|
||||
container = podmanapi.containers.run(image="docker.io/library/nginx:latest",name='test',detach=True,publish_all_ports=True)
|
||||
#run_container = podmanapi.containers.run(image="docker.io/library/nginx:latest",name='test',stdout=True, stderr=False)
|
||||
#json_str = json.dumps(podmanapi.containers.list(), indent=4)
|
||||
#print(json.dumps(podmanapi.version(), indent=4))
|
||||
return Response(content=output2, media_type='application/json')
|
||||
8
venv/bin/ghp-import
Executable file
8
venv/bin/ghp-import
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from ghp_import import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/markdown_py
Executable file
8
venv/bin/markdown_py
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from markdown.__main__ import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
8
venv/bin/mkdocs
Executable file
8
venv/bin/mkdocs
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from mkdocs.__main__ import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
8
venv/bin/normalizer
Executable file
8
venv/bin/normalizer
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer.cli.normalizer import cli_detect
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli_detect())
|
||||
8
venv/bin/watchmedo
Executable file
8
venv/bin/watchmedo
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from watchdog.watchmedo import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
@ -0,0 +1 @@
|
||||
pip
|
||||
@ -0,0 +1,29 @@
|
||||
Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
|
||||
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
|
||||
Copyright 2004 Manfred Stienstra (the original version)
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Python Markdown Project nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
|
||||
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
|
||||
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
@ -0,0 +1,109 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: Markdown
|
||||
Version: 3.3.7
|
||||
Summary: Python implementation of Markdown.
|
||||
Home-page: https://Python-Markdown.github.io/
|
||||
Author: Manfred Stienstra, Yuri Takhteyev and Waylan Limberg
|
||||
Author-email: python.markdown@gmail.com
|
||||
Maintainer: Waylan Limberg
|
||||
Maintainer-email: python.markdown@gmail.com
|
||||
License: BSD License
|
||||
Project-URL: Documentation, https://Python-Markdown.github.io/
|
||||
Project-URL: GitHub Project, https://github.com/Python-Markdown/markdown
|
||||
Project-URL: Issue Tracker, https://github.com/Python-Markdown/markdown/issues
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Communications :: Email :: Filters
|
||||
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content :: CGI Tools/Libraries
|
||||
Classifier: Topic :: Internet :: WWW/HTTP :: Site Management
|
||||
Classifier: Topic :: Software Development :: Documentation
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing :: Filters
|
||||
Classifier: Topic :: Text Processing :: Markup :: HTML
|
||||
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
||||
Requires-Python: >=3.6
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE.md
|
||||
Requires-Dist: importlib-metadata (>=4.4) ; python_version < "3.10"
|
||||
Provides-Extra: testing
|
||||
Requires-Dist: coverage ; extra == 'testing'
|
||||
Requires-Dist: pyyaml ; extra == 'testing'
|
||||
|
||||
[Python-Markdown][]
|
||||
===================
|
||||
|
||||
[![Build Status][build-button]][build]
|
||||
[![Coverage Status][codecov-button]][codecov]
|
||||
[![Latest Version][mdversion-button]][md-pypi]
|
||||
[![Python Versions][pyversion-button]][md-pypi]
|
||||
[![BSD License][bsdlicense-button]][bsdlicense]
|
||||
[![Code of Conduct][codeofconduct-button]][Code of Conduct]
|
||||
|
||||
[build-button]: https://github.com/Python-Markdown/markdown/workflows/CI/badge.svg?event=push
|
||||
[build]: https://github.com/Python-Markdown/markdown/actions?query=workflow%3ACI+event%3Apush
|
||||
[codecov-button]: https://codecov.io/gh/Python-Markdown/markdown/branch/master/graph/badge.svg
|
||||
[codecov]: https://codecov.io/gh/Python-Markdown/markdown
|
||||
[mdversion-button]: https://img.shields.io/pypi/v/Markdown.svg
|
||||
[md-pypi]: https://pypi.org/project/Markdown/
|
||||
[pyversion-button]: https://img.shields.io/pypi/pyversions/Markdown.svg
|
||||
[bsdlicense-button]: https://img.shields.io/badge/license-BSD-yellow.svg
|
||||
[bsdlicense]: https://opensource.org/licenses/BSD-3-Clause
|
||||
[codeofconduct-button]: https://img.shields.io/badge/code%20of%20conduct-contributor%20covenant-green.svg?style=flat-square
|
||||
[Code of Conduct]: https://github.com/Python-Markdown/markdown/blob/master/CODE_OF_CONDUCT.md
|
||||
|
||||
This is a Python implementation of John Gruber's [Markdown][].
|
||||
It is almost completely compliant with the reference implementation,
|
||||
though there are a few known issues. See [Features][] for information
|
||||
on what exactly is supported and what is not. Additional features are
|
||||
supported by the [Available Extensions][].
|
||||
|
||||
[Python-Markdown]: https://Python-Markdown.github.io/
|
||||
[Markdown]: https://daringfireball.net/projects/markdown/
|
||||
[Features]: https://Python-Markdown.github.io#Features
|
||||
[Available Extensions]: https://Python-Markdown.github.io/extensions
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
```bash
|
||||
pip install markdown
|
||||
```
|
||||
```python
|
||||
import markdown
|
||||
html = markdown.markdown(your_text_string)
|
||||
```
|
||||
|
||||
For more advanced [installation] and [usage] documentation, see the `docs/` directory
|
||||
of the distribution or the project website at <https://Python-Markdown.github.io/>.
|
||||
|
||||
[installation]: https://python-markdown.github.io/install/
|
||||
[usage]: https://python-markdown.github.io/reference/
|
||||
|
||||
See the change log at <https://Python-Markdown.github.io/change_log>.
|
||||
|
||||
Support
|
||||
-------
|
||||
|
||||
You may report bugs, ask for help, and discuss various other issues on the [bug tracker][].
|
||||
|
||||
[bug tracker]: https://github.com/Python-Markdown/markdown/issues
|
||||
|
||||
Code of Conduct
|
||||
---------------
|
||||
|
||||
Everyone interacting in the Python-Markdown project's codebases, issue trackers,
|
||||
and mailing lists is expected to follow the [Code of Conduct].
|
||||
|
||||
|
||||
@ -0,0 +1,76 @@
|
||||
../../../bin/markdown_py,sha256=5bO7NpqwwsC9tIBJKVslf9gK935iQp-pRU1efIXs_cQ,260
|
||||
Markdown-3.3.7.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
Markdown-3.3.7.dist-info/LICENSE.md,sha256=bxGTy2NHGOZcOlN9biXr1hSCDsDvaTz8EiSBEmONZNo,1645
|
||||
Markdown-3.3.7.dist-info/METADATA,sha256=yDPc4gkbiMBomDr_gIAPsQ8zVfkYoe9mRkon4KZ889A,4629
|
||||
Markdown-3.3.7.dist-info/RECORD,,
|
||||
Markdown-3.3.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
Markdown-3.3.7.dist-info/entry_points.txt,sha256=lMEyiiA_ZZyfPCBlDviBl-SiU0cfoeuEKpwxw361sKQ,1102
|
||||
Markdown-3.3.7.dist-info/top_level.txt,sha256=IAxs8x618RXoH1uCqeLLxXsDefJvE_mIibr_M4sOlyk,9
|
||||
markdown/__init__.py,sha256=002-LuHviYzROW2rg_gBGai81nMouUNO9UFj5nSsTSk,2065
|
||||
markdown/__main__.py,sha256=JX1057VoovH3NA5uH5nQdQE8b0kXoeT79ZxCzFoL_kg,5803
|
||||
markdown/__meta__.py,sha256=AjQfLZ5mSCOzSQk-HNK3jedyEPUyaZMTKwyXjRxCHso,1630
|
||||
markdown/__pycache__/__init__.cpython-311.pyc,,
|
||||
markdown/__pycache__/__main__.cpython-311.pyc,,
|
||||
markdown/__pycache__/__meta__.cpython-311.pyc,,
|
||||
markdown/__pycache__/blockparser.cpython-311.pyc,,
|
||||
markdown/__pycache__/blockprocessors.cpython-311.pyc,,
|
||||
markdown/__pycache__/core.cpython-311.pyc,,
|
||||
markdown/__pycache__/htmlparser.cpython-311.pyc,,
|
||||
markdown/__pycache__/inlinepatterns.cpython-311.pyc,,
|
||||
markdown/__pycache__/pep562.cpython-311.pyc,,
|
||||
markdown/__pycache__/postprocessors.cpython-311.pyc,,
|
||||
markdown/__pycache__/preprocessors.cpython-311.pyc,,
|
||||
markdown/__pycache__/serializers.cpython-311.pyc,,
|
||||
markdown/__pycache__/test_tools.cpython-311.pyc,,
|
||||
markdown/__pycache__/treeprocessors.cpython-311.pyc,,
|
||||
markdown/__pycache__/util.cpython-311.pyc,,
|
||||
markdown/blockparser.py,sha256=JpBhOokOoBUGCXolftOc5m1hPcR2y9s9hVd9WSuhHzo,4285
|
||||
markdown/blockprocessors.py,sha256=sUHIdAsGGAEN443TusbiNOWYt38DsHV-RqJdGjumIFc,24893
|
||||
markdown/core.py,sha256=ZHtqvLdVHOKWIuX_UzdL3rIcxMwji5TC5ZCkV19iM4U,15401
|
||||
markdown/extensions/__init__.py,sha256=kw5SehW1-IRGZk-ZR4aLQp2F0Qqj-Qql35qIBA8efnU,3663
|
||||
markdown/extensions/__pycache__/__init__.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/abbr.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/admonition.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/attr_list.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/codehilite.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/def_list.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/extra.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/fenced_code.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/footnotes.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/legacy_attrs.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/legacy_em.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/md_in_html.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/meta.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/nl2br.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/sane_lists.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/smarty.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/tables.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/toc.cpython-311.pyc,,
|
||||
markdown/extensions/__pycache__/wikilinks.cpython-311.pyc,,
|
||||
markdown/extensions/abbr.py,sha256=5TNU5ml6-H1n-fztEkgUphSTvp5yKCXaiPZMrVuRFvo,3186
|
||||
markdown/extensions/admonition.py,sha256=INIecvdzQ7RLmgP8M-N6AZJ5uMd6dBfh9Uj6YibgNLk,5847
|
||||
markdown/extensions/attr_list.py,sha256=nhKFY_u6BVyKW2oMUeC4wEjqFNGpDSnNXqaohuF6M7I,5988
|
||||
markdown/extensions/codehilite.py,sha256=C4Jiuc-EwQQaedIGo_2sGyoZR6YgTR0FocusKWZc6Vg,11710
|
||||
markdown/extensions/def_list.py,sha256=HKStriCfwosWRjHgiph6hHIHwGjasEaE6UYW-_hstVo,3635
|
||||
markdown/extensions/extra.py,sha256=ruwYAcbIaFxAmcT4pLoaRdw8Ok6sFTYWza7OAstcvtI,1831
|
||||
markdown/extensions/fenced_code.py,sha256=Hd2RDaRWcCd4aI9fedoI6EElPmIVrD2BlvXdhRV64ik,7209
|
||||
markdown/extensions/footnotes.py,sha256=Ux13UAjNiptuyvCnDIII89YWTZ0DTrEmkrwcyen7ZgM,15485
|
||||
markdown/extensions/legacy_attrs.py,sha256=qx4d8c_mxt0JZ7wP9Sfskvi3cZN-OtDGTFCi4gapZ74,2547
|
||||
markdown/extensions/legacy_em.py,sha256=8mtzOGYu_FXKO7DrBVr_5v5ZH6ru1yv1TiobYBEFV5Q,1582
|
||||
markdown/extensions/md_in_html.py,sha256=F4CUIa2DjDPLEIuJCbmbw9jL1mbFloPhraedynZL9Ig,15829
|
||||
markdown/extensions/meta.py,sha256=EUfkzM7l7UpH__Or9K3pl8ldVddwndlCZWA3d712RAE,2331
|
||||
markdown/extensions/nl2br.py,sha256=wAqTNOuf2L1NzlEvEqoID70n9y-aiYaGLkuyQk3CD0w,783
|
||||
markdown/extensions/sane_lists.py,sha256=ZQmCf-247KBexVG0fc62nDvokGkV6W1uavYbieNKSG4,1505
|
||||
markdown/extensions/smarty.py,sha256=0padzkVCNACainKw-Xj1S5UfT0125VCTfNejmrCZItA,10238
|
||||
markdown/extensions/tables.py,sha256=u8gmond8fQqJB4nVcljvDG5EvEueMPn_UaPEp5o9hF0,7685
|
||||
markdown/extensions/toc.py,sha256=uOGElJ4K4-BAYGHdjwUT1Xv9iQ-PAgwI493NpyoRC9g,14136
|
||||
markdown/extensions/wikilinks.py,sha256=GkgT9BY7b1-qW--dIwFAhC9V20RoeF13b7CFdw_V21Q,2812
|
||||
markdown/htmlparser.py,sha256=Eios9Ui8L5IvdmlN8aTXioiTgOrrnUrxH2kPpLhgU0U,13033
|
||||
markdown/inlinepatterns.py,sha256=nVzm5T01WvCMSyLeopmVwacvYYBvQw0Ac-Svj_TnudI,29775
|
||||
markdown/pep562.py,sha256=5UkqT7sb-cQufgbOl_jF-RYUVVHS7VThzlMzR9vrd3I,8917
|
||||
markdown/postprocessors.py,sha256=NeJyWBqPeDuBBJLTGs5Bfm5oTkUBXk9HWBeQy2_OldI,4262
|
||||
markdown/preprocessors.py,sha256=-s8QGHGlX7JAIJTfCivuc-CVwTLWs0IyEU94YUT2IvQ,2742
|
||||
markdown/serializers.py,sha256=_wQl-iJrPSUEQ4Q1owWYqN9qceVh6TOlAOH_i44BKAQ,6540
|
||||
markdown/test_tools.py,sha256=Iht9NMNtDgtrMCNdjkMN9EWXjgsJjaFzJQVvzxo1Da0,8363
|
||||
markdown/treeprocessors.py,sha256=MIdj6cv1YiIafGFk8wy-hhV3ZQfgvuIdMErwA7286Ns,15434
|
||||
markdown/util.py,sha256=Xo8dhPcIwzYZ7RzzIGXoeC3Nq41aaP16B7JO076e28A,16063
|
||||
@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@ -0,0 +1,22 @@
|
||||
[console_scripts]
|
||||
markdown_py = markdown.__main__:run
|
||||
|
||||
[markdown.extensions]
|
||||
abbr = markdown.extensions.abbr:AbbrExtension
|
||||
admonition = markdown.extensions.admonition:AdmonitionExtension
|
||||
attr_list = markdown.extensions.attr_list:AttrListExtension
|
||||
codehilite = markdown.extensions.codehilite:CodeHiliteExtension
|
||||
def_list = markdown.extensions.def_list:DefListExtension
|
||||
extra = markdown.extensions.extra:ExtraExtension
|
||||
fenced_code = markdown.extensions.fenced_code:FencedCodeExtension
|
||||
footnotes = markdown.extensions.footnotes:FootnoteExtension
|
||||
legacy_attrs = markdown.extensions.legacy_attrs:LegacyAttrExtension
|
||||
legacy_em = markdown.extensions.legacy_em:LegacyEmExtension
|
||||
md_in_html = markdown.extensions.md_in_html:MarkdownInHtmlExtension
|
||||
meta = markdown.extensions.meta:MetaExtension
|
||||
nl2br = markdown.extensions.nl2br:Nl2BrExtension
|
||||
sane_lists = markdown.extensions.sane_lists:SaneListExtension
|
||||
smarty = markdown.extensions.smarty:SmartyExtension
|
||||
tables = markdown.extensions.tables:TableExtension
|
||||
toc = markdown.extensions.toc:TocExtension
|
||||
wikilinks = markdown.extensions.wikilinks:WikiLinkExtension
|
||||
@ -0,0 +1 @@
|
||||
markdown
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
||||
pip
|
||||
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@ -0,0 +1,269 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: charset-normalizer
|
||||
Version: 2.1.1
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Home-page: https://github.com/ousret/charset_normalizer
|
||||
Author: Ahmed TAHRI @Ousret
|
||||
Author-email: ahmed.tahri@cloudnursery.dev
|
||||
License: MIT
|
||||
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
||||
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.6.0
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode_backport
|
||||
Requires-Dist: unicodedata2 ; extra == 'unicode_backport'
|
||||
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋 <a href="https://twitter.com/intent/tweet?text=The%20Real%20First%20Universal%20Charset%20%26%20Language%20Detector&url=https://www.github.com/Ousret/charset_normalizer&hashtags=python,encoding,chardet,developers"><img src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social"/></a></h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
|
||||
<img src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| `Fast` | ❌<br> | ✅<br> | ✅ <br> |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding, even if it covers most of the commonly used ones*<br>
|
||||
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
||||
|
||||
## ⭐ Your support
|
||||
|
||||
*Fork, test-it, star-it, submit your ideas! We do listen.*
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
||||
| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
| ------------- | :-------------: | :------------------: | :------------------: |
|
||||
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
||||
| charset-normalizer | 400 ms | 200 ms | 15 ms |
|
||||
|
||||
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet's accuracy vs. ours is measured using Chardet's initial capabilities
|
||||
> (e.g. supported encodings). Challenge them if you want.
|
||||
|
||||
[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
|
||||
a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using PyPi for latest stable
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
If you want a more up-to-date `unicodedata` than the one available in your Python setup, install the `unicode_backport` extra:
|
||||
```sh
|
||||
pip install charset-normalizer[unicode_backport] -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
:tada: Since version 1.4.0 the CLI produces easily usable results on stdout in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Normalize any text file*
|
||||
```python
|
||||
from charset_normalizer import normalize
|
||||
try:
|
||||
normalize('./my_subtitle.srt') # should write to disk my_subtitle-***.srt
|
||||
except IOError as e:
|
||||
print('Sadly, we are unable to perform charset normalization.', str(e))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) backward-compatible (BC) result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered strings.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair broken Unicode strings, whereas charset-normalizer converts a raw file in an unknown encoding to Unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding tables that could not fit the binary content.
|
||||
- Measure chaos, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is chaos/mess and coherence according to **YOU ?**
|
||||
|
||||
*Chaos :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
||||
I know that my interpretation of what is chaotic is very subjective, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © 2019 [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
@ -0,0 +1,33 @@
|
||||
../../../bin/normalizer,sha256=b3q6GnKL6HKDTJiUkB-KSeBZ9IklrLVlcj-BQXVT6kY,290
|
||||
charset_normalizer-2.1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-2.1.1.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
|
||||
charset_normalizer-2.1.1.dist-info/METADATA,sha256=C99l12g4d1E9_UiW-mqPCWx7v2M_lYGWxy1GTOjXSsA,11942
|
||||
charset_normalizer-2.1.1.dist-info/RECORD,,
|
||||
charset_normalizer-2.1.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
charset_normalizer-2.1.1.dist-info/entry_points.txt,sha256=uYo8aIGLWv8YgWfSna5HnfY_En4pkF1w4bgawNAXzP0,76
|
||||
charset_normalizer-2.1.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=jGhhf1IcOgCpZsr593E9fPvjWKnflVqHe_LwkOJjInU,1790
|
||||
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-311.pyc,,
|
||||
charset_normalizer/api.py,sha256=euVPmjAMbjpqhEHPjfKtyy1mK52U0TOUBUQgM_Qy6eE,19191
|
||||
charset_normalizer/assets/__init__.py,sha256=r7aakPaRIc2FFG2mw2V8NOTvkl25_euKZ3wPf5SAVa4,15222
|
||||
charset_normalizer/assets/__pycache__/__init__.cpython-311.pyc,,
|
||||
charset_normalizer/cd.py,sha256=Pxdkbn4cy0iZF42KTb1FiWIqqKobuz_fDjGwc6JMNBc,10811
|
||||
charset_normalizer/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
|
||||
charset_normalizer/cli/__pycache__/normalizer.cpython-311.pyc,,
|
||||
charset_normalizer/cli/normalizer.py,sha256=FmD1RXeMpRBg_mjR0MaJhNUpM2qZ8wz2neAE7AayBeg,9521
|
||||
charset_normalizer/constant.py,sha256=NgU-pY8JH2a9lkVT8oKwAFmIUYNKOuSBwZgF9MrlNCM,19157
|
||||
charset_normalizer/legacy.py,sha256=XKeZOts_HdYQU_Jb3C9ZfOjY2CiUL132k9_nXer8gig,3384
|
||||
charset_normalizer/md.py,sha256=pZP8IVpSC82D8INA9Tf_y0ijJSRI-UIncZvLdfTWEd4,17642
|
||||
charset_normalizer/models.py,sha256=i68YdlSLTEI3EEBVXq8TLNAbyyjrLC2OWszc-OBAk9I,13167
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=ykOznhcAeH-ODLBWJuI7t1nbwa1SAfN_bDYTCJGyh4U,11771
|
||||
charset_normalizer/version.py,sha256=_eh2MA3qS__IajlePQxKBmlw6zaBDvPYlLdEgxgIojw,79
|
||||
@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
normalizer = charset_normalizer.cli.normalizer:cli_detect
|
||||
@ -0,0 +1 @@
|
||||
charset_normalizer
|
||||
@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Others methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .legacy import (
|
||||
CharsetDetector,
|
||||
CharsetDoctor,
|
||||
CharsetNormalizerMatch,
|
||||
CharsetNormalizerMatches,
|
||||
detect,
|
||||
)
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
# Public API of the package: the names exposed by ``from charset_normalizer import *``.
# Every entry is imported at the top of this module from .api, .legacy, .models,
# .utils or .version.
__all__ = (
    "from_fp",
    "from_path",
    "from_bytes",
    "normalize",
    "detect",
    "CharsetMatch",
    "CharsetMatches",
    "CharsetNormalizerMatch",
    "CharsetNormalizerMatches",
    "CharsetDetector",
    "CharsetDoctor",
    "__version__",
    "VERSION",
    "set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
584
venv/lib/python3.11/site-packages/charset_normalizer/api.py
Normal file
584
venv/lib/python3.11/site-packages/charset_normalizer/api.py
Normal file
@ -0,0 +1,584 @@
|
||||
import logging
|
||||
import warnings
|
||||
from os import PathLike
|
||||
from os.path import basename, splitext
|
||||
from typing import Any, BinaryIO, List, Optional, Set
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")

# Package-wide logger used by the detection functions of this module.
logger = logging.getLogger("charset_normalizer")

# Stream handler that from_bytes() attaches to `logger` only while it runs with
# explain=True, and removes again before returning.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
|
||||
|
||||
|
||||
def from_bytes(
    sequences: bytes,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a
    given sequence, and will give up a particular code page after 20% of measured mess.
    Those criteria are customizable at will.

    The preemptive behaviour DOES NOT replace the traditional detection workflow, it prioritizes a particular
    code page but never takes it for granted. Can improve the performance.

    You may want to focus your attention on some code pages and/or ignore others, use cp_isolation and
    cp_exclusion for that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler, if you choose to set the
    'explain' toggle to True it will alter the logger configuration to add a StreamHandler that is suitable
    for debugging. Custom logging format and handler can be set manually.

    Parameters:
      sequences: payload to analyse. Anything that is not bytes/bytearray raises TypeError.
      steps: number of sampling offsets spread over the payload (forced to 1 when the payload is not
        larger than chunk_size * steps).
      chunk_size: size handed to cut_sequence_chunks for each sampled chunk.
      threshold: mess ratio at or above which a chunk, or the mean of all chunks, disqualifies an encoding.
      cp_isolation: when given and non-empty, only these encodings (normalized through iana_name) are tried.
      cp_exclusion: when given and non-empty, these encodings (normalized through iana_name) are skipped.
      preemptive_behaviour: when True, the encoding returned by any_specified_encoding(sequences), if any,
        is tried first.
      explain: when True, explain_handler is attached and the logger level is set to TRACE for the
        duration of the call.

    Returns a CharsetMatches collection. It contains a single utf_8 match for an empty payload, a single
    match when an early winner is found, and may be empty when no encoding fits.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if explain:
        # Remember the current level so that every exit path below can restore it.
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        if explain:
            logger.removeHandler(explain_handler)
            # NOTE(review): this is the only exit path that falls back to logging.WARNING when the
            # saved level is 0; the other exit paths restore previous_logger_level as-is.
            logger.setLevel(previous_logger_level or logging.WARNING)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # The payload is not larger than the requested sampling window: analyse it as one single chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    # Shrink the chunk size so that it never exceeds the distance between two sampling offsets.
    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    # Encodings tried before the generic IANA_SUPPORTED list, in this order:
    # declared encoding (if any), BOM/SIG encoding (if any), ascii, utf_8.
    prioritized_encodings: List[str] = []

    specified_encoding: Optional[str] = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: Set[str] = set()
    # Encodings that could not decode the payload at all.
    tested_but_hard_failure: List[str] = []
    # Encodings that decoded the payload but were rejected by the mess (chaos) probing.
    tested_but_soft_failure: List[str] = []

    # Matches kept aside for ascii / utf_8 / the declared encoding when they are rejected by the
    # mess probing; they are only used if the main loop yields no result.
    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:

        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        # Prioritized encodings also appear in IANA_SUPPORTED or may be listed twice: test each one once.
        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # First filter: the payload must be decodable with the candidate encoding.
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                # Large single-byte candidates: only a prefix is decoded here and the result is
                # discarded, so decoded_payload stays None (lazy decoding).
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            # A LookupError is not logged, but it is still recorded as a hard failure.
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        # Skip code pages that is_cp_similar reports as similar to one already rejected by the
        # mess probing.
        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Start offsets of the chunks to sample; sampling starts after the BOM/SIG when one matches.
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        # Number of chunks at or above the threshold after which the encoding is abandoned:
        # a quarter of the sampled offsets, but never less than 2.
        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: List[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(mess_ratio(chunk, threshold))

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                # Stop once enough chunks were rejected, or after the first chunk when a BOM/SIG
                # is present and must not be stripped.
                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            # Force the rejection branch below, and flag the failure so no fallback is stored.
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): the prefix decoded earlier ends at int(50e4) while this check starts
                # at int(50e3); the two ranges overlap. Confirm whether both offsets are intended.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                # The fallback is built with `threshold` where regular matches receive the mean
                # mess ratio, and with an empty list where they receive the merged coherence ratios.
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        # Languages that the candidate code page is expected to carry; used to narrow the
        # coherence (language) detection below.
        if not is_multi_byte_decoder:
            target_languages: List[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk, 0.1, ",".join(target_languages) if target_languages else None
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        # Early exit: a declared / ascii / utf_8 candidate with a mean mess ratio below 0.1 is
        # returned immediately as the only match.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

        # Early exit: the candidate matches the BOM/SIG found at the beginning of the payload.
        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

    # Nothing passed the probing: use one of the fallbacks, by priority declared > utf_8 > ascii.
    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        # NOTE(review): the last clause (`fallback_u8 is not None`) appears to subsume the two
        # previous ones unless CharsetMatch can evaluate as falsy; confirm before simplifying.
        elif (
            (fallback_u8 and fallback_ascii is None)
            or (
                fallback_u8
                and fallback_ascii
                and fallback_u8.fingerprint != fallback_ascii.fingerprint
            )
            or (fallback_u8 is not None)
        ):
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    if explain:
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)

    return results
|
||||
|
||||
|
||||
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Run the from_bytes detection on the content of an already opened binary file object.
    The whole remaining content is read through fp.read(); the file object is not closed here.
    Every other argument is forwarded unchanged to from_bytes.
    """
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
    )
|
||||
|
||||
|
||||
def from_path(
    path: "PathLike[Any]",
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Open the given file path in binary mode and run the from_fp detection on it.
    The file is closed before returning. Every other argument is forwarded unchanged to from_fp.
    Can raise IOError.
    """
    with open(path, "rb") as fp:
        matches = from_fp(
            fp,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
        )

    return matches
|
||||
|
||||
|
||||
def normalize(
    path: "PathLike[Any]",
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
) -> CharsetMatch:
    """
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.

    The charset is detected with from_path (the detection arguments are forwarded to it). The best
    match's output() is written to a new file located in the same directory as the source, whose
    name is "<name>-<detected encoding><extension>".

    Emits a DeprecationWarning on every call.
    Returns the best match. Raises IOError when no charset fits the file content (and whatever
    from_path / open can raise).
    """
    warnings.warn(
        "normalize is deprecated and will be removed in 3.0",
        DeprecationWarning,
    )

    results = from_path(
        path,
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
    )

    filename = basename(path)
    target_extensions = list(splitext(filename))

    if len(results) == 0:
        raise IOError(
            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
                filename
            )
        )

    result = results.best()

    target_extensions[0] += "-" + result.encoding  # type: ignore

    # Only swap the trailing file name. A plain str.replace() would also rewrite any directory
    # component that happens to contain the file name (e.g. "foo/foo" or "a.txt.d/a.txt") and
    # point the output at a wrong, possibly non-existent, directory.
    source_path = str(path)
    target_path = source_path[: len(source_path) - len(filename)] + "".join(
        target_extensions
    )

    with open(target_path, "wb") as fp:
        fp.write(result.output())  # type: ignore

    return result  # type: ignore
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
339
venv/lib/python3.11/site-packages/charset_normalizer/cd.py
Normal file
339
venv/lib/python3.11/site-packages/charset_normalizer/cd.py
Normal file
@ -0,0 +1,339 @@
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the main unicode ranges produced by a single-byte code page.

    Each byte from 0x40 to 0xFE is decoded on its own; the non-secondary unicode range of every
    decoded character is counted. The ranges that account for at least 15% of the counted
    characters are returned, sorted alphabetically.
    Raises IOError when the code page is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    occurrences: Dict[str, int] = {}
    counted: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        # Bytes that are not mapped by the code page decode to an empty string (errors="ignore").
        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is not False:
            continue

        occurrences[range_name] = occurrences.get(range_name, 0) + 1
        counted += 1

    return sorted(
        range_name
        for range_name, hits in occurrences.items()
        if hits / counted >= 0.15
    )
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages of FREQUENCIES that own at least one character belonging to the given
    unicode range. The order of FREQUENCIES is preserved.
    """
    return [
        language
        for language, alphabet in FREQUENCIES.items()
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]
|
||||
|
||||
|
||||
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte code page with the language(s) it is most likely used for.

    The first unicode range of the code page whose name does not contain "Latin" is taken as
    the primary range and converted to languages through unicode_range_languages.
    When every range is a Latin one, ["Latin Based"] is returned.
    """
    primary_range: Optional[str] = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte code page with the language(s) it is most likely used for.

    The decision is made on the code page name only: Japanese, Chinese and Korean families are
    recognized, any other name yields an empty list.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name == "cp932" or iana_name.startswith(japanese_prefixes):
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Tell, for a language of FREQUENCIES, whether its characters include at least one accentuated
    character and whether all of them are Latin characters.
    Returns the tuple (have accents, is pure Latin).
    """
    alphabet = FREQUENCIES[language]

    have_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    pure_latin: bool = all(is_latin(letter) is not False for letter in alphabet)

    return have_accents, pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    List the languages whose known characters overlap with the given characters.

    :param characters: Characters observed in the source text.
    :param ignore_non_latin: When True, languages that are not pure Latin are skipped.
    :return: Language names ordered from the highest to the lowest overlap ratio.
             Only languages for which at least 20% of the characters listed in
             FREQUENCIES are present in ``characters`` are kept.
    """
    source_have_accents = any(is_accentuated(character) for character in characters)

    scored_languages: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accentuated source cannot come from a language that has no accent at all.
        if source_have_accents and target_have_accents is False:
            continue

        matched_count: int = sum(
            1 for candidate in language_characters if candidate in characters
        )
        ratio: float = matched_count / len(language_characters)

        if ratio >= 0.2:
            scored_languages.append((language, ratio))

    # Stable sort: languages sharing the same ratio keep their FREQUENCIES order.
    scored_languages.sort(key=lambda scored: scored[1], reverse=True)

    return [language for language, _ in scored_languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Score how well a character ranking (most frequent first) matches a language.

    For every character that the language knows, the characters ranked before and
    after it in ``ordered_characters`` are compared with the ones ranked before and
    after it in FREQUENCIES[language]. The character is approved when either side
    shares at least 40% of the reference side. The check is deliberately lenient,
    so a close match can already score 1.

    :param language: Key of the FREQUENCIES table.
    :param ordered_characters: Characters sorted by decreasing number of occurrences.
    :return: Approved characters divided by len(ordered_characters), between 0. and 1.
    :raises ValueError: If the language is not part of FREQUENCIES.
    :raises ZeroDivisionError: If ordered_characters is empty.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    reference_characters: List[str] = FREQUENCIES[language]
    known_characters = set(reference_characters)

    character_approved_count: int = 0

    for character in ordered_characters:
        if character not in known_characters:
            continue

        reference_rank: int = reference_characters.index(character)
        observed_rank: int = ordered_characters.index(character)

        characters_before_source: List[str] = reference_characters[:reference_rank]
        # This slice starts at the character itself, hence it is never empty.
        characters_after_source: List[str] = reference_characters[reference_rank:]

        before_match_count: int = len(
            set(ordered_characters[:observed_rank]) & set(characters_before_source)
        )
        after_match_count: int = len(
            set(ordered_characters[observed_rank:]) & set(characters_after_source)
        )

        # The two leading tests short-circuit before a division by an empty side.
        if (
            (len(characters_before_source) == 0 and before_match_count <= 4)
            or (len(characters_after_source) == 0 and after_match_count <= 4)
            or before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1

    return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split the alphabetic characters of a decoded text into per-alphabet layers.

    Ex. a text written in English/Latin with a bit of Hebrew yields two items:
    one holding the Latin letters and the other the Hebrew ones.

    Non-alphabetic characters and characters without a known Unicode range are
    dropped. Every kept character is lower-cased and appended to its layer.

    :return: One string per layer, in order of first appearance.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        # Reuse the first known layer whose range is not suspicious next to this
        # character's range; otherwise the character's own range names the layer.
        layer_target_range: str = next(
            (
                discovered_range
                for discovered_range in layers
                if is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ),
            character_range,
        )

        layers[layer_target_range] = (
            layers.get(layer_target_range, "") + character.lower()
        )

    return list(layers.values())
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several results previously produced by coherence_ratio.

    Ratios reported for the same language are averaged and rounded to 4 decimals.

    :return: Same shape as coherence_ratio: (language, ratio) tuples, highest ratio first.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for result in results:
        for language, ratio in result:
            ratios_by_language.setdefault(language, []).append(ratio)

    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    merged.sort(key=lambda entry: entry[1], reverse=True)

    return merged
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet/range by alpha_unicode_split().

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimal ratio a language must reach to be reported.
    :param lg_inclusion: Optional comma separated list of languages to test. The special
                         entry "Latin Based" is not tested itself; it restricts the
                         automatic language discovery to pure Latin languages.
    :return: (language, ratio) tuples, ratio rounded to 4 decimals, highest ratio first.
    """
    lg_inclusion_list = [] if lg_inclusion is None else lg_inclusion.split(",")

    ignore_non_latin: bool = "Latin Based" in lg_inclusion_list
    if ignore_non_latin:
        lg_inclusion_list.remove("Latin Based")

    results: List[Tuple[str, float]] = []
    sufficient_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        most_common = Counter(layer).most_common()

        character_count: int = sum(occurrences for _, occurrences in most_common)

        # Layers that are too short are skipped.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [
            character for character, _ in most_common
        ]

        # An explicit inclusion list wins over the languages guessed from the alphabet.
        candidate_languages = lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Three strong matches overall: stop testing languages for this layer.
            if sufficient_match_count >= 3:
                break

    results.sort(key=lambda match: match[1], reverse=True)

    return results
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,295 @@
|
||||
import argparse
|
||||
import sys
|
||||
from json import dumps
|
||||
from os.path import abspath
|
||||
from platform import python_version
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from unicodedata2 import unidata_version
|
||||
except ImportError:
|
||||
from unicodedata import unidata_version
|
||||
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    The question is asked again until a valid answer is given.
    The return value is True for "yes" or False for "no".

    A ValueError is raised when "default" is none of the accepted values.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    prompts = {None: " [y/n] ", "yes": " [Y/n] ", "no": " [y/N] "}

    try:
        prompt = prompts[default]
    except (KeyError, TypeError):
        raise ValueError("invalid default answer: '%s'" % default) from None

    while True:
        sys.stdout.write(question + prompt)
        reply = input().lower()

        if reply == "" and default is not None:
            return answers[default]

        if reply in answers:
            return answers[reply]

        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def _detection_result(path: str, match, is_preferred: bool) -> "CliDetectionResult":
    """Build the CLI record describing one detection match for the file at ``path``."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Every given file is analysed with from_fp(). The outcome is printed to STDOUT,
    either as JSON (an object for a single record, a list otherwise) or, with
    --minimal, as one line of detected charset(s) per file. With --normalize, the
    best guess is also written back as UTF-8, next to the input file (with the
    detected encoding inserted before the last dot-separated part of the file name)
    or over it when --replace is given.

    :param argv: Arguments to parse. None lets argparse read the process arguments.
    :return: 0 if everything is fine, anything else equal trouble
             (1: invalid combination/value of options, 2: normalized file could not be written)
    """
    # Local import: keeps this function independent from the module import block.
    from os.path import basename, dirname, join

    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
            __version__, python_version(), unidata_version
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:
        # The finally clause closes the input on every exit path of this
        # iteration (normal end, continue, return).
        try:
            matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

            best_guess = matches.best()

            if best_guess is None:
                print(
                    'Unable to identify originating encoding for "{}". {}'.format(
                        my_file.name,
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0
                        else "",
                    ),
                    file=sys.stderr,
                )
                x_.append(
                    CliDetectionResult(
                        abspath(my_file.name),
                        None,
                        [],
                        [],
                        "Unknown",
                        [],
                        False,
                        1.0,
                        0.0,
                        None,
                        True,
                    )
                )
                # Nothing to normalize without a best guess.
                continue

            # Keep a handle on THIS file's record: the normalized path must be
            # stored on it, not on the first record of the whole run.
            best_result = _detection_result(abspath(my_file.name), best_guess, True)
            x_.append(best_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            _detection_result(abspath(my_file.name), el, False)
                        )

            if args.normalize is False:
                continue

            if best_guess.encoding.startswith("utf") is True:
                print(
                    '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                        my_file.name
                    ),
                    file=sys.stderr,
                )
                continue

            # Split the file name only (not the directories) so that absolute
            # paths and dotted directory names produce a correct output path.
            dir_path = dirname(abspath(my_file.name))
            o_: List[str] = basename(my_file.name).split(".")

            if args.replace is False:
                o_.insert(-1, best_guess.encoding)
            elif (
                args.force is False
                and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name
                    ),
                    "no",
                )
                is False
            ):
                continue

            try:
                best_result.unicode_path = join(dir_path, ".".join(o_))

                with open(best_result.unicode_path, "w", encoding="utf-8") as fp:
                    fp.write(str(best_guess))
            except IOError as e:
                print(str(e), file=sys.stderr)
                return 2
        finally:
            if my_file.closed is False:
                my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
||||
|
||||
|
||||
# Run the command-line detector when this module is executed as a script.
# The status code returned by cli_detect() is not propagated to the process exit code.
if __name__ == "__main__":
    cli_detect()
|
||||
497
venv/lib/python3.11/site-packages/charset_normalizer/constant.py
Normal file
497
venv/lib/python3.11/site-packages/charset_normalizer/constant.py
Normal file
@ -0,0 +1,497 @@
|
||||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||
from encodings.aliases import aliases
|
||||
from re import IGNORECASE, compile as re_compile
|
||||
from typing import Dict, List, Set, Union
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
|
||||
# For each eligible encoding, the byte signature (SIG/BOM) that marks it:
# either a single bytes value or a list of alternative signatures.
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
    "utf_8": BOM_UTF8,
    # ASCII "+/v8", "+/v9", "+/v+", "+/v/" and "+/v8-"
    "utf_7": [
        b"\x2b\x2f\x76\x38",
        b"\x2b\x2f\x76\x39",
        b"\x2b\x2f\x76\x2b",
        b"\x2b\x2f\x76\x2f",
        b"\x2b\x2f\x76\x38\x2d",
    ],
    "gb18030": b"\x84\x31\x95\x33",
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}

# Size thresholds, presumably counted in characters/bytes depending on the caller — TODO confirm.
TOO_SMALL_SEQUENCE: int = 32
# 10e6 == 10_000_000
TOO_BIG_SEQUENCE: int = int(10e6)

# 1112064 == 0x110000 - 2048; presumably the count of code points UTF-8 can encode
# (all code points minus the surrogates) — TODO confirm.
UTF8_MAXIMAL_ALLOCATION: int = 1112064
|
||||
|
||||
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
||||
"Control character": range(31 + 1),
|
||||
"Basic Latin": range(32, 127 + 1),
|
||||
"Latin-1 Supplement": range(128, 255 + 1),
|
||||
"Latin Extended-A": range(256, 383 + 1),
|
||||
"Latin Extended-B": range(384, 591 + 1),
|
||||
"IPA Extensions": range(592, 687 + 1),
|
||||
"Spacing Modifier Letters": range(688, 767 + 1),
|
||||
"Combining Diacritical Marks": range(768, 879 + 1),
|
||||
"Greek and Coptic": range(880, 1023 + 1),
|
||||
"Cyrillic": range(1024, 1279 + 1),
|
||||
"Cyrillic Supplement": range(1280, 1327 + 1),
|
||||
"Armenian": range(1328, 1423 + 1),
|
||||
"Hebrew": range(1424, 1535 + 1),
|
||||
"Arabic": range(1536, 1791 + 1),
|
||||
"Syriac": range(1792, 1871 + 1),
|
||||
"Arabic Supplement": range(1872, 1919 + 1),
|
||||
"Thaana": range(1920, 1983 + 1),
|
||||
"NKo": range(1984, 2047 + 1),
|
||||
"Samaritan": range(2048, 2111 + 1),
|
||||
"Mandaic": range(2112, 2143 + 1),
|
||||
"Syriac Supplement": range(2144, 2159 + 1),
|
||||
"Arabic Extended-A": range(2208, 2303 + 1),
|
||||
"Devanagari": range(2304, 2431 + 1),
|
||||
"Bengali": range(2432, 2559 + 1),
|
||||
"Gurmukhi": range(2560, 2687 + 1),
|
||||
"Gujarati": range(2688, 2815 + 1),
|
||||
"Oriya": range(2816, 2943 + 1),
|
||||
"Tamil": range(2944, 3071 + 1),
|
||||
"Telugu": range(3072, 3199 + 1),
|
||||
"Kannada": range(3200, 3327 + 1),
|
||||
"Malayalam": range(3328, 3455 + 1),
|
||||
"Sinhala": range(3456, 3583 + 1),
|
||||
"Thai": range(3584, 3711 + 1),
|
||||
"Lao": range(3712, 3839 + 1),
|
||||
"Tibetan": range(3840, 4095 + 1),
|
||||
"Myanmar": range(4096, 4255 + 1),
|
||||
"Georgian": range(4256, 4351 + 1),
|
||||
"Hangul Jamo": range(4352, 4607 + 1),
|
||||
"Ethiopic": range(4608, 4991 + 1),
|
||||
"Ethiopic Supplement": range(4992, 5023 + 1),
|
||||
"Cherokee": range(5024, 5119 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1),
|
||||
"Ogham": range(5760, 5791 + 1),
|
||||
"Runic": range(5792, 5887 + 1),
|
||||
"Tagalog": range(5888, 5919 + 1),
|
||||
"Hanunoo": range(5920, 5951 + 1),
|
||||
"Buhid": range(5952, 5983 + 1),
|
||||
"Tagbanwa": range(5984, 6015 + 1),
|
||||
"Khmer": range(6016, 6143 + 1),
|
||||
"Mongolian": range(6144, 6319 + 1),
|
||||
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1),
|
||||
"Limbu": range(6400, 6479 + 1),
|
||||
"Tai Le": range(6480, 6527 + 1),
|
||||
"New Tai Lue": range(6528, 6623 + 1),
|
||||
"Khmer Symbols": range(6624, 6655 + 1),
|
||||
"Buginese": range(6656, 6687 + 1),
|
||||
"Tai Tham": range(6688, 6831 + 1),
|
||||
"Combining Diacritical Marks Extended": range(6832, 6911 + 1),
|
||||
"Balinese": range(6912, 7039 + 1),
|
||||
"Sundanese": range(7040, 7103 + 1),
|
||||
"Batak": range(7104, 7167 + 1),
|
||||
"Lepcha": range(7168, 7247 + 1),
|
||||
"Ol Chiki": range(7248, 7295 + 1),
|
||||
"Cyrillic Extended C": range(7296, 7311 + 1),
|
||||
"Sundanese Supplement": range(7360, 7375 + 1),
|
||||
"Vedic Extensions": range(7376, 7423 + 1),
|
||||
"Phonetic Extensions": range(7424, 7551 + 1),
|
||||
"Phonetic Extensions Supplement": range(7552, 7615 + 1),
|
||||
"Combining Diacritical Marks Supplement": range(7616, 7679 + 1),
|
||||
"Latin Extended Additional": range(7680, 7935 + 1),
|
||||
"Greek Extended": range(7936, 8191 + 1),
|
||||
"General Punctuation": range(8192, 8303 + 1),
|
||||
"Superscripts and Subscripts": range(8304, 8351 + 1),
|
||||
"Currency Symbols": range(8352, 8399 + 1),
|
||||
"Combining Diacritical Marks for Symbols": range(8400, 8447 + 1),
|
||||
"Letterlike Symbols": range(8448, 8527 + 1),
|
||||
"Number Forms": range(8528, 8591 + 1),
|
||||
"Arrows": range(8592, 8703 + 1),
|
||||
"Mathematical Operators": range(8704, 8959 + 1),
|
||||
"Miscellaneous Technical": range(8960, 9215 + 1),
|
||||
"Control Pictures": range(9216, 9279 + 1),
|
||||
"Optical Character Recognition": range(9280, 9311 + 1),
|
||||
"Enclosed Alphanumerics": range(9312, 9471 + 1),
|
||||
"Box Drawing": range(9472, 9599 + 1),
|
||||
"Block Elements": range(9600, 9631 + 1),
|
||||
"Geometric Shapes": range(9632, 9727 + 1),
|
||||
"Miscellaneous Symbols": range(9728, 9983 + 1),
|
||||
"Dingbats": range(9984, 10175 + 1),
|
||||
"Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1),
|
||||
"Supplemental Arrows-A": range(10224, 10239 + 1),
|
||||
"Braille Patterns": range(10240, 10495 + 1),
|
||||
"Supplemental Arrows-B": range(10496, 10623 + 1),
|
||||
"Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1),
|
||||
"Supplemental Mathematical Operators": range(10752, 11007 + 1),
|
||||
"Miscellaneous Symbols and Arrows": range(11008, 11263 + 1),
|
||||
"Glagolitic": range(11264, 11359 + 1),
|
||||
"Latin Extended-C": range(11360, 11391 + 1),
|
||||
"Coptic": range(11392, 11519 + 1),
|
||||
"Georgian Supplement": range(11520, 11567 + 1),
|
||||
"Tifinagh": range(11568, 11647 + 1),
|
||||
"Ethiopic Extended": range(11648, 11743 + 1),
|
||||
"Cyrillic Extended-A": range(11744, 11775 + 1),
|
||||
"Supplemental Punctuation": range(11776, 11903 + 1),
|
||||
"CJK Radicals Supplement": range(11904, 12031 + 1),
|
||||
"Kangxi Radicals": range(12032, 12255 + 1),
|
||||
"Ideographic Description Characters": range(12272, 12287 + 1),
|
||||
"CJK Symbols and Punctuation": range(12288, 12351 + 1),
|
||||
"Hiragana": range(12352, 12447 + 1),
|
||||
"Katakana": range(12448, 12543 + 1),
|
||||
"Bopomofo": range(12544, 12591 + 1),
|
||||
"Hangul Compatibility Jamo": range(12592, 12687 + 1),
|
||||
"Kanbun": range(12688, 12703 + 1),
|
||||
"Bopomofo Extended": range(12704, 12735 + 1),
|
||||
"CJK Strokes": range(12736, 12783 + 1),
|
||||
"Katakana Phonetic Extensions": range(12784, 12799 + 1),
|
||||
"Enclosed CJK Letters and Months": range(12800, 13055 + 1),
|
||||
"CJK Compatibility": range(13056, 13311 + 1),
|
||||
"CJK Unified Ideographs Extension A": range(13312, 19903 + 1),
|
||||
"Yijing Hexagram Symbols": range(19904, 19967 + 1),
|
||||
"CJK Unified Ideographs": range(19968, 40959 + 1),
|
||||
"Yi Syllables": range(40960, 42127 + 1),
|
||||
"Yi Radicals": range(42128, 42191 + 1),
|
||||
"Lisu": range(42192, 42239 + 1),
|
||||
"Vai": range(42240, 42559 + 1),
|
||||
"Cyrillic Extended-B": range(42560, 42655 + 1),
|
||||
"Bamum": range(42656, 42751 + 1),
|
||||
"Modifier Tone Letters": range(42752, 42783 + 1),
|
||||
"Latin Extended-D": range(42784, 43007 + 1),
|
||||
"Syloti Nagri": range(43008, 43055 + 1),
|
||||
"Common Indic Number Forms": range(43056, 43071 + 1),
|
||||
"Phags-pa": range(43072, 43135 + 1),
|
||||
"Saurashtra": range(43136, 43231 + 1),
|
||||
"Devanagari Extended": range(43232, 43263 + 1),
|
||||
"Kayah Li": range(43264, 43311 + 1),
|
||||
"Rejang": range(43312, 43359 + 1),
|
||||
"Hangul Jamo Extended-A": range(43360, 43391 + 1),
|
||||
"Javanese": range(43392, 43487 + 1),
|
||||
"Myanmar Extended-B": range(43488, 43519 + 1),
|
||||
"Cham": range(43520, 43615 + 1),
|
||||
"Myanmar Extended-A": range(43616, 43647 + 1),
|
||||
"Tai Viet": range(43648, 43743 + 1),
|
||||
"Meetei Mayek Extensions": range(43744, 43775 + 1),
|
||||
"Ethiopic Extended-A": range(43776, 43823 + 1),
|
||||
"Latin Extended-E": range(43824, 43887 + 1),
|
||||
"Cherokee Supplement": range(43888, 43967 + 1),
|
||||
"Meetei Mayek": range(43968, 44031 + 1),
|
||||
"Hangul Syllables": range(44032, 55215 + 1),
|
||||
"Hangul Jamo Extended-B": range(55216, 55295 + 1),
|
||||
"High Surrogates": range(55296, 56191 + 1),
|
||||
"High Private Use Surrogates": range(56192, 56319 + 1),
|
||||
"Low Surrogates": range(56320, 57343 + 1),
|
||||
"Private Use Area": range(57344, 63743 + 1),
|
||||
"CJK Compatibility Ideographs": range(63744, 64255 + 1),
|
||||
"Alphabetic Presentation Forms": range(64256, 64335 + 1),
|
||||
"Arabic Presentation Forms-A": range(64336, 65023 + 1),
|
||||
"Variation Selectors": range(65024, 65039 + 1),
|
||||
"Vertical Forms": range(65040, 65055 + 1),
|
||||
"Combining Half Marks": range(65056, 65071 + 1),
|
||||
"CJK Compatibility Forms": range(65072, 65103 + 1),
|
||||
"Small Form Variants": range(65104, 65135 + 1),
|
||||
"Arabic Presentation Forms-B": range(65136, 65279 + 1),
|
||||
"Halfwidth and Fullwidth Forms": range(65280, 65519 + 1),
|
||||
"Specials": range(65520, 65535 + 1),
|
||||
"Linear B Syllabary": range(65536, 65663 + 1),
|
||||
"Linear B Ideograms": range(65664, 65791 + 1),
|
||||
"Aegean Numbers": range(65792, 65855 + 1),
|
||||
"Ancient Greek Numbers": range(65856, 65935 + 1),
|
||||
"Ancient Symbols": range(65936, 65999 + 1),
|
||||
"Phaistos Disc": range(66000, 66047 + 1),
|
||||
"Lycian": range(66176, 66207 + 1),
|
||||
"Carian": range(66208, 66271 + 1),
|
||||
"Coptic Epact Numbers": range(66272, 66303 + 1),
|
||||
"Old Italic": range(66304, 66351 + 1),
|
||||
"Gothic": range(66352, 66383 + 1),
|
||||
"Old Permic": range(66384, 66431 + 1),
|
||||
"Ugaritic": range(66432, 66463 + 1),
|
||||
"Old Persian": range(66464, 66527 + 1),
|
||||
"Deseret": range(66560, 66639 + 1),
|
||||
"Shavian": range(66640, 66687 + 1),
|
||||
"Osmanya": range(66688, 66735 + 1),
|
||||
"Osage": range(66736, 66815 + 1),
|
||||
"Elbasan": range(66816, 66863 + 1),
|
||||
"Caucasian Albanian": range(66864, 66927 + 1),
|
||||
"Linear A": range(67072, 67455 + 1),
|
||||
"Cypriot Syllabary": range(67584, 67647 + 1),
|
||||
"Imperial Aramaic": range(67648, 67679 + 1),
|
||||
"Palmyrene": range(67680, 67711 + 1),
|
||||
"Nabataean": range(67712, 67759 + 1),
|
||||
"Hatran": range(67808, 67839 + 1),
|
||||
"Phoenician": range(67840, 67871 + 1),
|
||||
"Lydian": range(67872, 67903 + 1),
|
||||
"Meroitic Hieroglyphs": range(67968, 67999 + 1),
|
||||
"Meroitic Cursive": range(68000, 68095 + 1),
|
||||
"Kharoshthi": range(68096, 68191 + 1),
|
||||
"Old South Arabian": range(68192, 68223 + 1),
|
||||
"Old North Arabian": range(68224, 68255 + 1),
|
||||
"Manichaean": range(68288, 68351 + 1),
|
||||
"Avestan": range(68352, 68415 + 1),
|
||||
"Inscriptional Parthian": range(68416, 68447 + 1),
|
||||
"Inscriptional Pahlavi": range(68448, 68479 + 1),
|
||||
"Psalter Pahlavi": range(68480, 68527 + 1),
|
||||
"Old Turkic": range(68608, 68687 + 1),
|
||||
"Old Hungarian": range(68736, 68863 + 1),
|
||||
"Rumi Numeral Symbols": range(69216, 69247 + 1),
|
||||
"Brahmi": range(69632, 69759 + 1),
|
||||
"Kaithi": range(69760, 69839 + 1),
|
||||
"Sora Sompeng": range(69840, 69887 + 1),
|
||||
"Chakma": range(69888, 69967 + 1),
|
||||
"Mahajani": range(69968, 70015 + 1),
|
||||
"Sharada": range(70016, 70111 + 1),
|
||||
"Sinhala Archaic Numbers": range(70112, 70143 + 1),
|
||||
"Khojki": range(70144, 70223 + 1),
|
||||
"Multani": range(70272, 70319 + 1),
|
||||
"Khudawadi": range(70320, 70399 + 1),
|
||||
"Grantha": range(70400, 70527 + 1),
|
||||
"Newa": range(70656, 70783 + 1),
|
||||
"Tirhuta": range(70784, 70879 + 1),
|
||||
"Siddham": range(71040, 71167 + 1),
|
||||
"Modi": range(71168, 71263 + 1),
|
||||
"Mongolian Supplement": range(71264, 71295 + 1),
|
||||
"Takri": range(71296, 71375 + 1),
|
||||
"Ahom": range(71424, 71487 + 1),
|
||||
"Warang Citi": range(71840, 71935 + 1),
|
||||
"Zanabazar Square": range(72192, 72271 + 1),
|
||||
"Soyombo": range(72272, 72367 + 1),
|
||||
"Pau Cin Hau": range(72384, 72447 + 1),
|
||||
"Bhaiksuki": range(72704, 72815 + 1),
|
||||
"Marchen": range(72816, 72895 + 1),
|
||||
"Masaram Gondi": range(72960, 73055 + 1),
|
||||
"Cuneiform": range(73728, 74751 + 1),
|
||||
"Cuneiform Numbers and Punctuation": range(74752, 74879 + 1),
|
||||
"Early Dynastic Cuneiform": range(74880, 75087 + 1),
|
||||
"Egyptian Hieroglyphs": range(77824, 78895 + 1),
|
||||
"Anatolian Hieroglyphs": range(82944, 83583 + 1),
|
||||
"Bamum Supplement": range(92160, 92735 + 1),
|
||||
"Mro": range(92736, 92783 + 1),
|
||||
"Bassa Vah": range(92880, 92927 + 1),
|
||||
"Pahawh Hmong": range(92928, 93071 + 1),
|
||||
"Miao": range(93952, 94111 + 1),
|
||||
"Ideographic Symbols and Punctuation": range(94176, 94207 + 1),
|
||||
"Tangut": range(94208, 100351 + 1),
|
||||
"Tangut Components": range(100352, 101119 + 1),
|
||||
"Kana Supplement": range(110592, 110847 + 1),
|
||||
"Kana Extended-A": range(110848, 110895 + 1),
|
||||
"Nushu": range(110960, 111359 + 1),
|
||||
"Duployan": range(113664, 113823 + 1),
|
||||
"Shorthand Format Controls": range(113824, 113839 + 1),
|
||||
"Byzantine Musical Symbols": range(118784, 119039 + 1),
|
||||
"Musical Symbols": range(119040, 119295 + 1),
|
||||
"Ancient Greek Musical Notation": range(119296, 119375 + 1),
|
||||
"Tai Xuan Jing Symbols": range(119552, 119647 + 1),
|
||||
"Counting Rod Numerals": range(119648, 119679 + 1),
|
||||
"Mathematical Alphanumeric Symbols": range(119808, 120831 + 1),
|
||||
"Sutton SignWriting": range(120832, 121519 + 1),
|
||||
"Glagolitic Supplement": range(122880, 122927 + 1),
|
||||
"Mende Kikakui": range(124928, 125151 + 1),
|
||||
"Adlam": range(125184, 125279 + 1),
|
||||
"Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1),
|
||||
"Mahjong Tiles": range(126976, 127023 + 1),
|
||||
"Domino Tiles": range(127024, 127135 + 1),
|
||||
"Playing Cards": range(127136, 127231 + 1),
|
||||
"Enclosed Alphanumeric Supplement": range(127232, 127487 + 1),
|
||||
"Enclosed Ideographic Supplement": range(127488, 127743 + 1),
|
||||
"Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1),
|
||||
"Emoticons range(Emoji)": range(128512, 128591 + 1),
|
||||
"Ornamental Dingbats": range(128592, 128639 + 1),
|
||||
"Transport and Map Symbols": range(128640, 128767 + 1),
|
||||
"Alchemical Symbols": range(128768, 128895 + 1),
|
||||
"Geometric Shapes Extended": range(128896, 129023 + 1),
|
||||
"Supplemental Arrows-C": range(129024, 129279 + 1),
|
||||
"Supplemental Symbols and Pictographs": range(129280, 129535 + 1),
|
||||
"CJK Unified Ideographs Extension B": range(131072, 173791 + 1),
|
||||
"CJK Unified Ideographs Extension C": range(173824, 177983 + 1),
|
||||
"CJK Unified Ideographs Extension D": range(177984, 178207 + 1),
|
||||
"CJK Unified Ideographs Extension E": range(178208, 183983 + 1),
|
||||
"CJK Unified Ideographs Extension F": range(183984, 191471 + 1),
|
||||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||
"Tags": range(917504, 917631 + 1),
|
||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||
}
|
||||
|
||||
|
||||
# Words found in the names of Unicode ranges; presumably a range whose name contains
# one of them is treated as "secondary" by the callers — TODO confirm against usage.
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
    "Supplement",
    "Extended",
    "Extensions",
    "Modifier",
    "Marks",
    "Punctuation",
    "Symbols",
    "Forms",
    "Operators",
    "Miscellaneous",
    "Drawing",
    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]

# Case-insensitive pattern matching "encoding", "charset" or "coding", followed by
# 1 to 10 characters among ':', '=' and space, then an optionally quoted name.
# The only capturing group is the name itself (letters, digits, '-' and '_').
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)
|
||||
|
||||
# Sorted, de-duplicated codec names taken from the interpreter's alias table.
# Names ending with "_codec" and the rot_13, tactis and mbcs entries are left out.
IANA_SUPPORTED: List[str] = sorted(
    codec_name
    for codec_name in set(aliases.values())
    if codec_name.endswith("_codec") is False
    and codec_name not in {"rot_13", "tactis", "mbcs"}
)

IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||
|
||||
# Pre-computed table of similar code pages, produced with the function
# cp_similarity: each codec name maps to the codecs considered similar to it.
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": [
        "cp437", "cp850", "cp857", "cp858",
        "cp860", "cp861", "cp862", "cp863",
    ],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10", "iso8859_15", "iso8859_16",
        "iso8859_3", "iso8859_9", "latin_1",
    ],
    "iso8859_15": [
        "cp1252", "cp1254",
        "iso8859_10", "iso8859_14", "iso8859_16",
        "iso8859_3", "iso8859_9", "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14", "iso8859_15", "iso8859_2",
        "iso8859_3", "iso8859_9", "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252", "cp1254", "cp1258",
        "iso8859_10", "iso8859_14", "iso8859_15", "iso8859_16",
        "iso8859_3", "iso8859_4", "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252", "cp1254", "cp1258",
        "iso8859_10", "iso8859_14", "iso8859_15", "iso8859_16",
        "iso8859_3", "iso8859_4", "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}
|
||||
|
||||
|
||||
# Python codec name -> the spelling returned for it by the chardet-compatible
# detect() helper in legacy.py. Names absent from this table are returned as-is.
CHARDET_CORRESPONDENCE: Dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}
|
||||
|
||||
|
||||
# ASCII characters that the mess-detection plugins never count as suspicious
# punctuation or symbols.
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = set('<>=:/&;{}[],|"-')

# Codec names grouped as Korean / Chinese.
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}

# One or more digits, non-word characters, newlines, carriage returns or tabs.
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||
|
||||
# Number of entries in the FREQUENCIES table.
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)

# Custom logging level that sits below DEBUG.
TRACE: int = 5
|
||||
@ -0,0 +1,95 @@
|
||||
import warnings
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, normalize
|
||||
from .constant import CHARDET_CORRESPONDENCE
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
|
||||
|
||||
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (bytes or bytearray).
    :return: A dict with the keys "encoding", "language" and "confidence".
             "encoding" and "confidence" are None when no match was found;
             "language" is an empty string when it is unknown.
    :raises TypeError: If byte_str is neither bytes nor bytearray.
    """
    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_guess = from_bytes(byte_str).best()

    if best_guess is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_guess.encoding
        language = "" if best_guess.language == "Unknown" else best_guess.language
        confidence = 1.0 - best_guess.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped
        # in the detection/normalization process but chardet does return 'utf-8-sig'
        # and it is a valid codec name.
        if encoding == "utf_8" and best_guess.bom:
            encoding += "_sig"

    return {
        # Use chardet's own spelling when one is known, else the name as-is.
        "encoding": CHARDET_CORRESPONDENCE.get(encoding, encoding),
        "language": language,
        "confidence": confidence,
    }
|
||||
|
||||
|
||||
class CharsetNormalizerMatch(CharsetMatch):
    """Legacy name for CharsetMatch; adds nothing to the parent class."""
|
||||
|
||||
|
||||
class CharsetNormalizerMatches(CharsetMatches):
    """
    Legacy container exposing the detection entry points as static methods.
    Each static method emits a DeprecationWarning and then forwards all of its
    arguments to the module-level function of the same name.
    """

    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        """Deprecated: warn, then delegate to the module-level from_fp."""
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize "
            "are deprecated and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        """Deprecated: warn, then delegate to the module-level from_bytes."""
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize "
            "are deprecated and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        """Deprecated: warn, then delegate to the module-level from_path."""
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize "
            "are deprecated and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        """Deprecated: warn, then delegate to the module-level normalize."""
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize "
            "are deprecated and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)  # pragma: nocover
|
||||
|
||||
|
||||
class CharsetDetector(CharsetNormalizerMatches):
    """Legacy alias of CharsetNormalizerMatches; adds nothing to it."""
|
||||
|
||||
|
||||
class CharsetDoctor(CharsetNormalizerMatches):
    """Legacy alias of CharsetNormalizerMatches; adds nothing to it."""
|
||||
553
venv/lib/python3.11/site-packages/charset_normalizer/md.py
Normal file
553
venv/lib/python3.11/site-packages/charset_normalizer/md.py
Normal file
@ -0,0 +1,553 @@
|
||||
from functools import lru_cache
|
||||
from typing import List, Optional
|
||||
|
||||
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_ascii,
|
||||
is_case_variable,
|
||||
is_cjk,
|
||||
is_emoticon,
|
||||
is_hangul,
|
||||
is_hiragana,
|
||||
is_katakana,
|
||||
is_latin,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend it and implement every method below;
    each default implementation only raises NotImplementedError.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be passed to feed().
        """
        raise NotImplementedError()  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError()  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Put the plugin back into its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what feed() has seen so far.
        Must NOT be lower than 0.; no upper bound is imposed.
        """
        raise NotImplementedError()  # pragma: nocover
|
||||
|
||||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Scores how much of the printable text is punctuation or symbols.
    A symbol weighs twice as much as a punctuation mark, and the score is
    reported only once it reaches 30% of the characters seen.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Count the character and, when relevant, its punctuation/symbol weight."""
        self._character_count += 1

        # An immediate repetition of the previous character and the common
        # "safe" ASCII characters are never penalised.
        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the previously seen character; otherwise the first
        # character fed after a reset is compared with the old stream.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Share of punctuation and (double-weighted) symbols, or 0.0 below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Scores the share of alphabetic characters that are accentuated.
    The score is reported only once it reaches 35%.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are inspected."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the character, and count it again if it is accentuated."""
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Accentuated share of the characters seen, or 0.0 below 0.35."""
        total = self._character_count
        if total == 0:
            return 0.0

        accentuated_share: float = self._accentuated_count / total
        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
    """
    Scores the presence of unprintable characters.
    Each one weighs eight times as much as an ordinary character.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it as unprintable when it is."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._unprintable_count = 0
        # The total must be cleared too, otherwise the ratio computed after a
        # reset is still divided by the size of the previous input.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Eight times the unprintable share of the characters seen."""
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Scores pairs of consecutive accentuated Latin letters.
    A pair adds one point when both letters are upper case, and one more
    point when both are the same letter once their accents are removed.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic Latin characters are inspected."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """Compare the character with the previous eligible one."""
        self._character_count += 1
        previous = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """Zero the counters and forget the previous character."""
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Twice the number of points divided by the characters seen."""
        total = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
|
||||
|
||||
|
||||
class SuspiciousRange(MessDetectorPlugin):
    """
    Scores adjacent printable characters whose Unicode ranges are judged
    suspicious by is_suspiciously_successive_range().
    The score is reported only once it reaches 10%.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Compare the Unicode range of the character with the previous one."""
        self._character_count += 1

        # Whitespace, punctuation and the common safe ASCII characters break
        # the chain: the next character has nothing to be compared with.
        is_boundary = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if is_boundary:
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is None:
            self._last_printable_seen = character
            return

        previous_range: Optional[str] = unicode_range(previous)
        current_range: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(previous_range, current_range):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Zero the counters and forget the previous character."""
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Twice the suspicious pairs divided by the characters seen, or 0.0 below 0.1."""
        if self._character_count == 0:
            return 0.0

        suspicious_share: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        if suspicious_share < 0.1:
            return 0.0
        return suspicious_share
|
||||
|
||||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Splits the text into words and scores the characters that belong to
    "bad" words. A word is bad when it is heavily accentuated, ends with an
    upper case accentuated letter, is very long while containing foreign
    letters, or contains a symbol.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Grow the current word, or close and judge it on a word boundary."""
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching for an overlong "foreign" word as soon as a
            # letter is either non-Latin or accentuated, unless it belongs to
            # one of the scripts excluded below.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Nothing buffered: there is no word to close or to taint.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._buffer = ""
        # Must be cleared together with the buffer, otherwise accents of a
        # discarded partial word are charged to the next word.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of the word characters that belong to bad words.
        Returns 0.0 while at most 10 words were seen and no "foreign long"
        word was counted.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Count wrong stops and, separately, the other CJK characters."""
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        """Wrong stops per CJK character; 0.0 until 16 CJK characters were seen."""
        cjk_total = self._cjk_character_count
        if cjk_total < 16:
            return 0.0
        return self._wrong_stop_count / cjk_total
|
||||
|
||||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Scores chunks of letters whose case keeps flipping (upper/lower/upper...).
    Flips are only charged when the chunk is at most 64 letters long,
    contains at least one non-ASCII letter and is not ended by a digit.
    """

    def __init__(self) -> None:
        # True after one case flip; a second consecutive flip scores points.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Track case flips inside the current chunk, or close the chunk."""
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # End of a chunk: keep its score only when it qualifies.
            chunk_counts = (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            )
            if chunk_counts:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True
            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if not case_flipped:
                self._buf = False
            elif self._buf is True:
                # Second flip in a row.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Retained flip score divided by the characters seen."""
        total = self._character_count
        if total == 0:
            return 0.0

        return self._successive_upper_lower_count_final / total
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.

    :param unicode_range_a: Name of the first range, or None when unknown.
    :param unicode_range_b: Name of the second range, or None when unknown.
    :return: True when the pair is suspicious. An unknown range is always
             suspicious; two identical ranges never are.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def in_either(keyword: str) -> bool:
        # True when the keyword occurs in at least one of the two range names.
        return keyword in unicode_range_a or keyword in unicode_range_b

    latin_in_a = "Latin" in unicode_range_a
    latin_in_b = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and in_either("Combining"):
        return False

    # Two names sharing a word that is not a generic qualifier are related.
    words_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_b:
            return False

    # Japanese Exception
    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana_ranges
    b_is_kana = unicode_range_b in kana_ranges
    cjk_involved = in_either("CJK")

    if (a_is_kana or b_is_kana) and cjk_involved:
        return False
    if a_is_kana and b_is_kana:
        return False

    if in_either("Hangul"):
        if cjk_involved:
            return False
        if "Basic Latin" in (unicode_range_a, unicode_range_b):
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if cjk_involved or (a_is_kana and b_is_kana):
        if in_either("Punctuation"):
            return False
        if in_either("Forms"):
            return False

    return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    :param decoded_sequence: The decoded text to score.
    :param maximum_threshold: Stop as soon as the summed ratio reaches this value.
    :param debug: When true, print the ratio of every plugin at the end.
    :return: The sum of the plugin ratios at the last checkpoint, rounded to 3 decimals.
    """
    # One fresh instance of every direct subclass of MessDetectorPlugin.
    plugins: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # A trailing newline is appended below, hence the extra character.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # How often (in characters) the summed ratio is re-evaluated.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for plugin in plugins:
            if plugin.eligible(character):
                plugin.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_every == 0
        if at_checkpoint or index == last_index:
            mean_mess_ratio = sum(plugin.ratio for plugin in plugins)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for plugin in plugins:  # pragma: nocover
            print(plugin.__class__, plugin.ratio)

    return round(mean_mess_ratio, 3)
|
||||
401
venv/lib/python3.11/site-packages/charset_normalizer/models.py
Normal file
401
venv/lib/python3.11/site-packages/charset_normalizer/models.py
Normal file
@ -0,0 +1,401 @@
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from encodings.aliases import aliases
|
||||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import (
|
||||
Any,
|
||||
Counter as TypeCounter,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
|
||||
from .md import mess_ratio
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
    self,
    payload: bytes,
    guessed_encoding: str,
    mean_mess_ratio: float,
    has_sig_or_bom: bool,
    languages: "CoherenceMatches",
    decoded_payload: Optional[str] = None,
):
    """
    Store the outcome of one detection attempt.

    :param payload: The raw bytes that were analysed.
    :param guessed_encoding: The encoding this match stands for.
    :param mean_mess_ratio: The mess (chaos) ratio measured for it.
    :param has_sig_or_bom: Whether a SIG/BOM was found in the payload.
    :param languages: Language matches; the first item of the first entry is
        read as a language name and its second item as a coherence ratio.
    :param decoded_payload: The decoded text if already available; when None
        it is decoded lazily by __str__.
    """
    # What was analysed, and its (possibly lazy) decoded form.
    self._payload: bytes = payload
    self._string: Optional[str] = decoded_payload

    # Detection results.
    self._encoding: str = guessed_encoding
    self._mean_mess_ratio: float = mean_mess_ratio
    self._has_sig_or_bom: bool = has_sig_or_bom
    self._languages: CoherenceMatches = languages
    self._mean_coherence_ratio: float = 0.0

    # Filled on demand / later on.
    self._unicode_ranges: Optional[List[str]] = None
    self._leaves: List[CharsetMatch] = []
    self._output_payload: Optional[bytes] = None
    self._output_encoding: Optional[str] = None
|
||||
|
||||
def __eq__(self, other: object) -> bool:
    """
    Two matches are equal when both their encoding and fingerprint are equal.

    :raises TypeError: If other is not a CharsetMatch.
    """
    if isinstance(other, CharsetMatch):
        same_encoding = self.encoding == other.encoding
        return same_encoding and self.fingerprint == other.fingerprint

    raise TypeError(
        "__eq__ cannot be invoked on {} and {}.".format(
            str(other.__class__), str(self.__class__)
        )
    )
|
||||
|
||||
def __lt__(self, other: object) -> bool:
    """
    Implemented to make sorted available upon CharsetMatches items.
    By default the match with the lower chaos sorts first; when both chaos
    values are within 1% and the coherence values differ by more than 2%,
    the match with the higher coherence sorts first instead.

    :raises ValueError: If other is not a CharsetMatch.
    """
    if not isinstance(other, CharsetMatch):
        raise ValueError

    chaos_gap: float = abs(self.chaos - other.chaos)
    coherence_gap: float = abs(self.coherence - other.coherence)

    # Below 1% difference --> Use Coherence
    use_coherence = chaos_gap < 0.01 and coherence_gap > 0.02
    if not use_coherence:
        return self.chaos < other.chaos

    # When having a tough decision, use the result that decoded as many multi-byte as possible.
    if chaos_gap == 0.0 and self.coherence == other.coherence:
        return self.multi_byte_usage > other.multi_byte_usage

    return self.coherence > other.coherence
|
||||
|
||||
@property
def multi_byte_usage(self) -> float:
    """
    Share of the raw payload "saved" by decoding: one minus the number of
    decoded characters divided by the number of raw bytes.

    Returns 0.0 for an empty payload instead of dividing by zero.
    """
    decoded_length = len(str(self))
    raw_length = len(self.raw)
    if raw_length == 0:
        return 0.0
    return 1.0 - decoded_length / raw_length
|
||||
|
||||
@property
def chaos_secondary_pass(self) -> float:
    """
    Check once again chaos in decoded text, except this time, with full content.
    Use with caution, this can be very slow.
    Notice: Will be removed in 3.0
    """
    deprecation_notice = "chaos_secondary_pass is deprecated and will be removed in 3.0"
    warnings.warn(deprecation_notice, DeprecationWarning)
    # A threshold of 1.0 is passed to mess_ratio for this second pass.
    return mess_ratio(str(self), 1.0)
|
||||
|
||||
@property
def coherence_non_latin(self) -> float:
    """
    Coherence ratio on the first non-latin language detected if ANY.
    Notice: Will be removed in 3.0

    Emits a DeprecationWarning and always returns 0.0.
    """
    deprecation_notice = "coherence_non_latin is deprecated and will be removed in 3.0"
    warnings.warn(deprecation_notice, DeprecationWarning)
    return 0.0
|
||||
|
||||
@property
def w_counter(self) -> TypeCounter[str]:
    """
    Word counter instance on decoded text.
    Notice: Will be removed in 3.0
    """
    warnings.warn(
        "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
    )

    # Lower-case the text, blank out every run matched by
    # NOT_PRINTABLE_PATTERN, then count the whitespace-separated words.
    lowered_text = str(self).lower()
    words = sub(NOT_PRINTABLE_PATTERN, " ", lowered_text).split()

    return Counter(words)
|
||||
|
||||
def __str__(self) -> str:
    """Return the decoded payload, decoding it strictly on first use."""
    # Lazy Str Loading
    if self._string is not None:
        return self._string

    self._string = str(self._payload, self._encoding, "strict")
    return self._string
|
||||
|
||||
def __repr__(self) -> str:
    """Debug representation showing the encoding and the fingerprint."""
    template = "<CharsetMatch '{}' bytes({})>"
    return template.format(self.encoding, self.fingerprint)
|
||||
|
||||
def add_submatch(self, other: "CharsetMatch") -> None:
    """
    Register another match as a leaf of this one.

    :raises ValueError: If other is not a CharsetMatch or is equal to self.
    """
    if not isinstance(other, CharsetMatch) or other == self:
        message = "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
            other.__class__
        )
        raise ValueError(message)

    # Unload RAM usage; dirty trick. (The leaf decodes again lazily if needed.)
    other._string = None
    self._leaves.append(other)
|
||||
|
||||
@property
def encoding(self) -> str:
    """The encoding name given to the constructor."""
    return self._encoding
|
||||
|
||||
@property
def encoding_aliases(self) -> List[str]:
    """
    Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.

    Walks encodings.aliases.aliases and returns, for every entry whose alias
    or target equals this encoding, the other side of that entry.
    """
    current = self.encoding
    return [
        target if alias == current else alias
        for alias, target in aliases.items()
        if current in (alias, target)
    ]
|
||||
|
||||
@property
def bom(self) -> bool:
    """The has_sig_or_bom flag given to the constructor."""
    return self._has_sig_or_bom
|
||||
|
||||
@property
def byte_order_mark(self) -> bool:
    """Same value as the bom property, under a longer name."""
    return self._has_sig_or_bom
|
||||
|
||||
@property
def languages(self) -> List[str]:
    """
    Return the complete list of possible languages found in decoded sequence.
    Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
    """
    # Keep only the first item (the language name) of every stored entry.
    return [entry[0] for entry in self._languages]
|
||||
|
||||
@property
def language(self) -> str:
    """
    Most probable language found in decoded sequence. If none were detected or inferred, the property will return
    "Unknown".
    """
    if self._languages:
        return self._languages[0][0]

    # Trying to infer the language based on the given encoding
    # Its either English or we should not pronounce ourselves in certain cases.
    if "ascii" in self.could_be_from_charset:
        return "English"

    # doing it there to avoid circular import
    from charset_normalizer.cd import encoding_languages, mb_encoding_languages

    if is_multi_byte_encoding(self.encoding):
        languages = mb_encoding_languages(self.encoding)
    else:
        languages = encoding_languages(self.encoding)

    if len(languages) == 0 or "Latin Based" in languages:
        return "Unknown"

    return languages[0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
    """
    Return the stored mean mess ratio (``_mean_mess_ratio``).
    """
    ratio: float = self._mean_mess_ratio
    return ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
    """
    Return the score attached to the first recorded language, or 0.0 when none is recorded.
    """
    return self._languages[0][1] if self._languages else 0.0
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
    """
    Return ``chaos`` expressed as a percentage, rounded to three decimal places.
    """
    as_percent = self.chaos * 100
    return round(as_percent, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
    """
    Return ``coherence`` expressed as a percentage, rounded to three decimal places.
    """
    as_percent = self.coherence * 100
    return round(as_percent, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes:
    """
    Return the original, untouched bytes (``_payload``).
    """
    payload: bytes = self._payload
    return payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> List["CharsetMatch"]:
    """
    Return the list of submatches attached to this match (the same list object, not a copy).
    """
    leaves: List["CharsetMatch"] = self._leaves
    return leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
    """
    Tell whether at least one submatch is attached to this match.
    """
    leaf_count = len(self._leaves)
    return leaf_count > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> List[str]:
    """
    Return the sorted, de-duplicated Unicode range names found in ``str(self)``.

    The result is computed once and cached in ``_unicode_ranges``.
    """
    cached = self._unicode_ranges
    if cached is not None:
        return cached

    # Detect the range of every character, dropping characters with no (or an empty) range name.
    distinct_ranges = {
        range_name for range_name in map(unicode_range, str(self)) if range_name
    }
    self._unicode_ranges = sorted(distinct_ranges)
    return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> List[str]:
    """
    List every encoding that yields the exact same decoded string and could therefore be
    the originating one.

    The first element is this match's own encoding, followed by the encoding of each submatch.
    """
    candidates: List[str] = [self._encoding]
    candidates.extend(leaf.encoding for leaf in self._leaves)
    return candidates
|
||||
|
||||
def first(self) -> "CharsetMatch":
    """
    Backward-compatibility shim that returns this very instance. Will be removed in 3.0.
    """
    same_instance = self
    return same_instance
|
||||
|
||||
def best(self) -> "CharsetMatch":
    """
    Backward-compatibility shim that returns this very instance. Will be removed in 3.0.
    """
    same_instance = self
    return same_instance
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
    """
    Return ``str(self)`` re-encoded with the given target encoding (UTF-8 by default).

    Characters the target encoding cannot represent are substituted by the codec's
    "replace" error handler. The result is cached and only recomputed when a different
    target encoding is requested.
    """
    cache_is_stale = (
        self._output_encoding is None or self._output_encoding != encoding
    )
    if cache_is_stale:
        # Record the requested encoding first, then refresh the cached payload.
        self._output_encoding = encoding
        self._output_payload = str(self).encode(encoding, "replace")

    return self._output_payload  # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
    """
    Return the SHA256 hex digest of the transformed (re-encoded) payload from ``output()``,
    not of the original bytes.
    """
    reencoded = self.output()
    digest = sha256(reencoded)
    return digest.hexdigest()
|
||||
|
||||
|
||||
class CharsetMatches:
    """
    Container of CharsetMatch items, kept sorted by their own ordering (most probable first).
    Behaves like a list (iterable) but does not implement every list method.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        if results:
            self._results: List[CharsetMatch] = sorted(results)
        else:
            self._results = []

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or by encoding name (an alias may be used).
        Raises KeyError when the encoding is not present in the results or the key type is
        unsupported; an out-of-range position raises the underlying list's IndexError.
        """
        if isinstance(item, int):
            return self._results[item]

        if isinstance(item, str):
            # Non-strict normalisation: unknown names are passed through unchanged.
            wanted = iana_name(item, False)
            for candidate in self._results:
                if wanted in candidate.could_be_from_charset:
                    return candidate

        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match while keeping the results sorted.
        The item may instead be attached as a submatch of an existing, equivalent result.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # Submatch factoring is disabled for heavy inputs to conserve RAM.
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            twin = next(
                (
                    match
                    for match in self._results
                    if match.fingerprint == item.fingerprint
                    and match.chaos == item.chaos
                ),
                None,
            )
            if twin is not None:
                twin.add_submatch(item)
                return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Return the first match (strictly equivalent to matches[0]), or None when empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant alias of best(). Kept for BC reasons.
        """
        return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
    """
    Plain record of one detection result, serialisable to JSON through ``to_json``.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # File identification.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        # Encoding information.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        # Content information.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        # Scores and flags.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        # Keyword order defines the key order of the mapping (and of the JSON output).
        return dict(
            path=self.path,
            encoding=self.encoding,
            encoding_aliases=self.encoding_aliases,
            alternative_encodings=self.alternative_encodings,
            language=self.language,
            alphabets=self.alphabets,
            has_sig_or_bom=self.has_sig_or_bom,
            chaos=self.chaos,
            coherence=self.coherence,
            unicode_path=self.unicode_path,
            is_preferred=self.is_preferred,
        )

    def to_json(self) -> str:
        """Serialise the record as ASCII-only JSON indented by four spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
|
||||
424
venv/lib/python3.11/site-packages/charset_normalizer/utils.py
Normal file
424
venv/lib/python3.11/site-packages/charset_normalizer/utils.py
Normal file
@ -0,0 +1,424 @@
|
||||
try:
|
||||
# WARNING: unicodedata2 support is going to be removed in 3.0
|
||||
# Python is quickly catching up.
|
||||
import unicodedata2 as unicodedata
|
||||
except ImportError:
|
||||
import unicodedata # type: ignore[no-redef]
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator, List, Optional, Set, Tuple, Union
|
||||
|
||||
from _multibytecodec import MultibyteIncrementalDecoder
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of the character mentions one of the supported accents
    (grave, acute, cedilla, diaeresis, circumflex, tilde). Unnamed characters yield False.
    """
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False

    accent_markers = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
    )
    return any(marker in description for marker in accent_markers)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
    """
    Return the base character of an accentuated character.

    The base is the first code point of the character's Unicode decomposition.
    Characters without a decomposition are returned unchanged. So are characters whose
    decomposition starts with a formatting tag such as ``<compat>`` or ``<super>``:
    the tag is not a hexadecimal code point and used to make ``int(..., 16)`` raise
    ValueError.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    # Tagged (compatibility) decompositions do not describe "base letter + accent".
    if codes[0].startswith("<"):
        return character

    return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> Optional[str]:
    """
    Look up the name of the first entry of UNICODE_RANGES_COMBINED whose range contains
    the character's code point. Returns None when no entry matches.
    """
    code_point: int = ord(character)
    matching_names = (
        name
        for name, code_points in UNICODE_RANGES_COMBINED.items()
        if code_point in code_points
    )
    return next(matching_names, None)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "LATIN".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name: str = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "LATIN" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_ascii(character: str) -> bool:
    """
    Tell whether the character can be encoded with the "ascii" codec.
    """
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        encodable = False
    else:
        encodable = True
    return encodable
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
    """
    Tell whether the character is punctuation: either its Unicode category contains "P",
    or the name of its Unicode range contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    range_name: Optional[str] = unicode_range(character)
    return range_name is not None and "Punctuation" in range_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
    """
    Tell whether the character counts as a symbol: its Unicode category contains "S" or "N",
    or the name of its Unicode range contains "Forms".
    """
    category: str = unicodedata.category(character)
    if any(letter in category for letter in ("S", "N")):
        return True

    range_name: Optional[str] = unicode_range(character)
    return range_name is not None and "Forms" in range_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
    """
    Tell whether the name of the character's Unicode range contains "Emoticons".
    Characters outside every known range yield False.
    """
    range_name: Optional[str] = unicode_range(character)
    return range_name is not None and "Emoticons" in range_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_separator(character: str) -> bool:
    """
    Tell whether the character separates words: whitespace, one of ``| + , ; < >``,
    or any character whose Unicode category contains "Z".
    """
    explicit_separators = {"|", "+", ",", ";", "<", ">"}
    if character.isspace():
        return True
    if character in explicit_separators:
        return True

    return "Z" in unicodedata.category(character)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_case_variable(character: str) -> bool:
    """
    Tell whether exactly one of ``islower()`` / ``isupper()`` holds for the character.
    """
    lower: bool = character.islower()
    upper: bool = character.isupper()
    return lower != upper
|
||||
|
||||
|
||||
def is_private_use_only(character: str) -> bool:
    """
    Tell whether the character's Unicode category is exactly "Co" (private use).
    """
    return unicodedata.category(character) == "Co"
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "CJK".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "CJK" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hiragana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HIRAGANA".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "HIRAGANA" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_katakana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "KATAKANA".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "KATAKANA" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hangul(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HANGUL".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "HANGUL" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_thai(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "THAI".
    Characters without a Unicode name yield False.
    """
    try:
        unicode_name = unicodedata.name(character)
    except ValueError:
        return False
    else:
        return "THAI" in unicode_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
||||
def is_unicode_range_secondary(range_name: str) -> bool:
    """
    Tell whether the range name contains at least one keyword of
    UNICODE_SECONDARY_RANGE_KEYWORD.
    """
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True
    return False
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
    """
    Tell whether the character is neither whitespace nor printable, with two exceptions
    that are never reported as unprintable: "\\x1A" and "\\ufeff".
    """
    # Whitespace (includes \n \t \r \v) is never considered unprintable.
    if character.isspace():
        return False
    if character.isprintable():
        return False

    # "\x1A" is the ASCII substitute character. "\ufeff" (Zero Width No-Break Space, located
    # in Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as space by Python.
    return character not in ("\x1A", "\ufeff")
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Look for an encoding declaration in the first ``search_zone`` bytes, decoded as ASCII
    with undecodable bytes ignored.

    Returns the name from the standard ``aliases`` table for the first declared encoding
    that is known there (matched as an alias or as a target name), or None when nothing
    usable is found. Raises TypeError when ``sequence`` is not ``bytes``.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    zone_end: int = min(len(sequence), search_zone)
    searchable_text: str = sequence[:zone_end].decode("ascii", errors="ignore")

    declared: List[str] = findall(RE_POSSIBLE_ENCODING_INDICATION, searchable_text)

    for declaration in declared:
        normalized = declaration.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            if normalized == encoding_alias or normalized == encoding_iana:
                return encoding_iana

    return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether an encoding, given by its IANA name, is a multi-byte one.

    The UTF family is listed explicitly; any other encoding qualifies when the
    IncrementalDecoder of its ``encodings.<name>`` module derives from
    MultibyteIncrementalDecoder.
    """
    unicode_family = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }
    if name in unicode_family:
        return True

    codec_module = importlib.import_module("encodings.{}".format(name))
    return issubclass(codec_module.IncrementalDecoder, MultibyteIncrementalDecoder)
|
||||
|
||||
|
||||
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify the SIG/BOM at the start of the sequence, if any.

    Returns ``(encoding name, mark)`` for the first entry of ENCODING_MARKS with a mark the
    sequence starts with, or ``(None, b"")`` when no mark matches.
    """
    for iana_encoding, marks in ENCODING_MARKS.items():
        # An entry holds either a single mark or a list of marks.
        candidates: List[bytes] = [marks] if isinstance(marks, bytes) else marks

        for mark in candidates:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
|
||||
|
||||
|
||||
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Tell whether the SIG/BOM should be stripped for this encoding: True for every
    encoding except "utf_16" and "utf_32".
    """
    keeps_its_mark = iana_encoding in {"utf_16", "utf_32"}
    return not keeps_its_mark
|
||||
|
||||
|
||||
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Normalise a code page name (lower-case, "-" replaced by "_") and resolve it through the
    standard ``aliases`` table.

    Returns the table's target name when the normalised name matches an alias or a target
    name. Otherwise raises ValueError when ``strict`` is true, or returns the normalised
    name unchanged when it is false.
    """
    cp_name = cp_name.lower().replace("-", "_")

    resolved = next(
        (
            encoding_iana
            for encoding_alias, encoding_iana in aliases.items()
            if cp_name == encoding_alias or cp_name == encoding_iana
        ),
        None,
    )
    if resolved is not None:
        return resolved

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
|
||||
|
||||
|
||||
def range_scan(decoded_sequence: str) -> List[str]:
|
||||
ranges: Set[str] = set()
|
||||
|
||||
for character in decoded_sequence:
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
ranges.add(character_range)
|
||||
|
||||
return list(ranges)
|
||||
|
||||
|
||||
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how similar two single-byte code pages are.

    Every possible byte value (0x00-0xFF) is decoded with both code pages, undecodable
    bytes being ignored, and the ratio of identical results is returned, so the value
    lies within [0.0, 1.0]. Multi-byte encodings are not comparable this way and
    yield 0.0.

    :param iana_name_a: IANA name of the first code page (an ``encodings`` module name)
    :param iana_name_b: IANA name of the second code page
    :return: Ratio of byte values decoded identically by both code pages
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # Compare all 256 byte values and divide by that same count. The previous
    # range(255) / 254 pairing skipped 0xFF and could return a ratio above 1.0.
    byte_values = range(256)
    character_match_count: int = 0

    for i in byte_values:
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / len(byte_values)
|
||||
|
||||
|
||||
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar, according to the
    IANA_SUPPORTED_SIMILAR dict, which was generated using the function cp_similarity.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False
    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
||||
|
||||
|
||||
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set the level of the named logger and attach a new StreamHandler that uses the
    given format string. Every call adds one more handler to the logger.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))

    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)
    target_logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield decoded chunks of ``chunk_size`` taken at each of the given offsets.

    With a decoded payload and a single-byte decoder, chunks are sliced straight from
    the decoded text and iteration stops at the first empty slice. Otherwise each chunk
    is cut from the raw bytes and decoded, the SIG/BOM being prepended when one is
    available and must not be stripped.
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for start in offsets:
            piece = decoded_payload[start : start + chunk_size]
            if not piece:
                break
            yield piece
        return

    keep_mark = bom_or_sig_available and strip_sig_or_bom is False
    error_policy = "ignore" if is_multi_byte_decoder else "strict"

    for start in offsets:
        stop = start + chunk_size
        # Skip windows that end more than 8 bytes past the end of the data.
        if stop > len(sequences) + 8:
            continue

        raw_piece = sequences[start:stop]
        if keep_mark:
            raw_piece = sig_payload + raw_piece

        piece = raw_piece.decode(encoding_iana, errors=error_policy)

        # Multi-byte bad cutting detector and adjustment: when the window starts on a
        # high byte and the head of the chunk is not found in the decoded payload,
        # retry with the start moved back by up to three bytes.
        if is_multi_byte_decoder and start > 0 and sequences[start] >= 0x80:
            head_size: int = min(chunk_size, 16)

            if decoded_payload and piece[:head_size] not in decoded_payload:
                for shifted_start in range(start, start - 4, -1):
                    raw_piece = sequences[shifted_start:stop]
                    if keep_mark:
                        raw_piece = sig_payload + raw_piece

                    piece = raw_piece.decode(encoding_iana, errors="ignore")

                    if piece[:head_size] in decoded_payload:
                        break

        yield piece
|
||||
@ -0,0 +1,6 @@
|
||||
"""
|
||||
Expose version
|
||||
"""
|
||||
|
||||
__version__ = "2.1.1"
|
||||
VERSION = __version__.split(".")
|
||||
8
venv/lib/python3.11/site-packages/dateutil/__init__.py
Normal file
8
venv/lib/python3.11/site-packages/dateutil/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
try:
|
||||
from ._version import version as __version__
|
||||
except ImportError:
|
||||
__version__ = 'unknown'
|
||||
|
||||
__all__ = ['easter', 'parser', 'relativedelta', 'rrule', 'tz',
|
||||
'utils', 'zoneinfo']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
43
venv/lib/python3.11/site-packages/dateutil/_common.py
Normal file
43
venv/lib/python3.11/site-packages/dateutil/_common.py
Normal file
@ -0,0 +1,43 @@
|
||||
"""
|
||||
Common code used in multiple modules.
|
||||
"""
|
||||
|
||||
|
||||
class weekday(object):
    """
    A weekday index (0 to 6, rendered as "MO" to "SU") with an optional signed
    occurrence number ``n``.
    """
    __slots__ = ["weekday", "n"]

    def __init__(self, weekday, n=None):
        self.weekday = weekday
        self.n = n

    def __call__(self, n):
        # Reuse this instance when nothing changes, otherwise build a sibling with the new n.
        return self if n == self.n else self.__class__(self.weekday, n)

    def __eq__(self, other):
        try:
            differs = self.weekday != other.weekday or self.n != other.n
        except AttributeError:
            # Objects lacking ``weekday`` / ``n`` are never equal to a weekday.
            return False
        return not differs

    def __hash__(self):
        key = (self.weekday, self.n)
        return hash(key)

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        day_names = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")
        s = day_names[self.weekday]
        if self.n:
            return "%s(%+d)" % (s, self.n)
        return s
|
||||
|
||||
# vim:ts=4:sw=4:et
|
||||
5
venv/lib/python3.11/site-packages/dateutil/_version.py
Normal file
5
venv/lib/python3.11/site-packages/dateutil/_version.py
Normal file
@ -0,0 +1,5 @@
|
||||
# coding: utf-8
|
||||
# file generated by setuptools_scm
|
||||
# don't change, don't track in version control
|
||||
version = '2.8.2'
|
||||
version_tuple = (2, 8, 2)
|
||||
89
venv/lib/python3.11/site-packages/dateutil/easter.py
Normal file
89
venv/lib/python3.11/site-packages/dateutil/easter.py
Normal file
@ -0,0 +1,89 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This module offers a generic Easter computing method for any given year, using
|
||||
Western, Orthodox or Julian algorithms.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
|
||||
__all__ = ["easter", "EASTER_JULIAN", "EASTER_ORTHODOX", "EASTER_WESTERN"]
|
||||
|
||||
# Identifiers of the supported Easter calculation methods.
EASTER_JULIAN = 1
EASTER_ORTHODOX = 2
EASTER_WESTERN = 3


def easter(year, method=EASTER_WESTERN):
    """
    Compute the date of Easter for the given year.

    Ported from the work done by GM Arts, on top of the algorithm by Claus Tondering,
    which was based in part on the algorithm of Ouding (1940), as quoted in
    "Explanatory Supplement to the Astronomical Almanac", P. Kenneth Seidelmann, editor.

    Three calculation methods are implemented:

    1. Original calculation in Julian calendar, valid in dates after 326 AD
    2. Original method, with date converted to Gregorian calendar, valid in
       years 1583 to 4099
    3. Revised method, in Gregorian calendar, valid in years 1583 to 4099 as well

    They are selected with the constants:

    * ``EASTER_JULIAN   = 1``
    * ``EASTER_ORTHODOX = 2``
    * ``EASTER_WESTERN  = 3``

    The default method is method 3. A method outside 1..3 raises ValueError.

    More about the algorithm may be found at:

    `GM Arts: Easter Algorithms <http://www.gmarts.org/index.php?go=415>`_

    and

    `The Calendar FAQ: Easter <https://www.tondering.dk/claus/cal/easter.php>`_
    """
    if method < 1 or method > 3:
        raise ValueError("invalid method")

    # g - Golden year - 1
    # c - Century
    # h - (23 - Epact) mod 30
    # i - Number of days from March 21 to Paschal Full Moon
    # j - Weekday for PFM (0=Sunday, etc)
    # p - Number of days from March 21 to Sunday on or before PFM
    #     (-6 to 28 methods 1 & 3, to 56 for method 2)
    # e - Extra days to add for method 2 (converting Julian
    #     date to Gregorian date)

    y = year
    g = y % 19
    e = 0

    if method == 3:
        # New method
        c = y//100
        h = (c - c//4 - (8*c + 13)//25 + 19*g + 15) % 30
        i = h - (h//28)*(1 - (h//28)*(29//(h + 1))*((21 - g)//11))
        j = (y + y//4 + i + 2 - c + c//4) % 7
    else:
        # Old method
        i = (19*g + 15) % 30
        j = (y + y//4 + i) % 7
        if method == 2:
            # Extra dates to convert Julian to Gregorian date
            e = 10
            if y > 1600:
                e = e + y//100 - 16 - (y//100 - 16)//4

    # p can be from -6 to 56 corresponding to dates 22 March to 23 May
    # (later dates apply to method 2, although 23 May never actually occurs)
    p = i - j + e
    day = 1 + (p + 27 + (p + 6)//40) % 31
    month = 3 + (p + 26)//30
    return datetime.date(int(y), int(month), int(day))
|
||||
@ -0,0 +1,61 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from ._parser import parse, parser, parserinfo, ParserError
|
||||
from ._parser import DEFAULTPARSER, DEFAULTTZPARSER
|
||||
from ._parser import UnknownTimezoneWarning
|
||||
|
||||
from ._parser import __doc__
|
||||
|
||||
from .isoparser import isoparser, isoparse
|
||||
|
||||
__all__ = ['parse', 'parser', 'parserinfo',
|
||||
'isoparse', 'isoparser',
|
||||
'ParserError',
|
||||
'UnknownTimezoneWarning']
|
||||
|
||||
|
||||
###
|
||||
# Deprecate portions of the private interface so that downstream code that
|
||||
# is improperly relying on it is given *some* notice.
|
||||
|
||||
|
||||
def __deprecated_private_func(f):
    # Wrap a private function so that every call emits a DeprecationWarning
    # before delegating to the original.
    from functools import wraps
    import warnings

    msg = ('{name} is a private function and may break without warning, '
           'it will be moved and or renamed in future versions.').format(
               name=f.__name__)

    def deprecated_func(*args, **kwargs):
        warnings.warn(msg, DeprecationWarning)
        return f(*args, **kwargs)

    return wraps(f)(deprecated_func)
|
||||
|
||||
def __deprecate_private_class(c):
    # Build a subclass of a private class that emits a DeprecationWarning on
    # instantiation and keeps the original class name and docstring.
    import warnings

    msg = ('{name} is a private class and may break without warning, '
           'it will be moved and or renamed in future versions.').format(
               name=c.__name__)

    class private_class(c):
        __doc__ = c.__doc__

        def __init__(self, *args, **kwargs):
            warnings.warn(msg, DeprecationWarning)
            super(private_class, self).__init__(*args, **kwargs)

    private_class.__name__ = c.__name__

    return private_class
|
||||
|
||||
|
||||
from ._parser import _timelex, _resultbase
|
||||
from ._parser import _tzparser, _parsetz
|
||||
|
||||
_timelex = __deprecate_private_class(_timelex)
|
||||
_tzparser = __deprecate_private_class(_tzparser)
|
||||
_resultbase = __deprecate_private_class(_resultbase)
|
||||
_parsetz = __deprecated_private_func(_parsetz)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
1613
venv/lib/python3.11/site-packages/dateutil/parser/_parser.py
Normal file
1613
venv/lib/python3.11/site-packages/dateutil/parser/_parser.py
Normal file
File diff suppressed because it is too large
Load Diff
416
venv/lib/python3.11/site-packages/dateutil/parser/isoparser.py
Normal file
416
venv/lib/python3.11/site-packages/dateutil/parser/isoparser.py
Normal file
@ -0,0 +1,416 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This module offers a parser for ISO-8601 strings
|
||||
|
||||
It is intended to support all valid date, time and datetime formats per the
|
||||
ISO-8601 specification.
|
||||
|
||||
..versionadded:: 2.7.0
|
||||
"""
|
||||
from datetime import datetime, timedelta, time, date
|
||||
import calendar
|
||||
from dateutil import tz
|
||||
|
||||
from functools import wraps
|
||||
|
||||
import re
|
||||
import six
|
||||
|
||||
__all__ = ["isoparse", "isoparser"]
|
||||
|
||||
|
||||
def _takes_ascii(f):
    # Decorator for parser methods: hands the wrapped method the content of a
    # stream (anything with a ``read`` attribute) and turns text into ASCII bytes.
    @wraps(f)
    def func(self, str_in, *args, **kwargs):
        # If it's a stream, read the whole thing
        if hasattr(str_in, 'read'):
            str_in = str_in.read()

        # Text is converted to bytes, since ISO-8601 only covers ASCII
        # (and ASCII is the same in UTF-8); anything else is passed through as is.
        if not isinstance(str_in, six.text_type):
            return f(self, str_in, *args, **kwargs)

        try:
            str_in = str_in.encode('ascii')
        except UnicodeEncodeError as e:
            msg = 'ISO-8601 strings should contain only ASCII characters'
            six.raise_from(ValueError(msg), e)

        return f(self, str_in, *args, **kwargs)

    return func
|
||||
|
||||
|
||||
class isoparser(object):
|
||||
def __init__(self, sep=None):
|
||||
"""
|
||||
:param sep:
|
||||
A single character that separates date and time portions. If
|
||||
``None``, the parser will accept any single character.
|
||||
For strict ISO-8601 adherence, pass ``'T'``.
|
||||
"""
|
||||
if sep is not None:
|
||||
if (len(sep) != 1 or ord(sep) >= 128 or sep in '0123456789'):
|
||||
raise ValueError('Separator must be a single, non-numeric ' +
|
||||
'ASCII character')
|
||||
|
||||
sep = sep.encode('ascii')
|
||||
|
||||
self._sep = sep
|
||||
|
||||
@_takes_ascii
|
||||
def isoparse(self, dt_str):
|
||||
"""
|
||||
Parse an ISO-8601 datetime string into a :class:`datetime.datetime`.
|
||||
|
||||
An ISO-8601 datetime string consists of a date portion, followed
|
||||
optionally by a time portion - the date and time portions are separated
|
||||
by a single character separator, which is ``T`` in the official
|
||||
standard. Incomplete date formats (such as ``YYYY-MM``) may *not* be
|
||||
combined with a time portion.
|
||||
|
||||
Supported date formats are:
|
||||
|
||||
Common:
|
||||
|
||||
- ``YYYY``
|
||||
- ``YYYY-MM`` or ``YYYYMM``
|
||||
- ``YYYY-MM-DD`` or ``YYYYMMDD``
|
||||
|
||||
Uncommon:
|
||||
|
||||
- ``YYYY-Www`` or ``YYYYWww`` - ISO week (day defaults to 0)
|
||||
- ``YYYY-Www-D`` or ``YYYYWwwD`` - ISO week and day
|
||||
|
||||
The ISO week and day numbering follows the same logic as
|
||||
:func:`datetime.date.isocalendar`.
|
||||
|
||||
Supported time formats are:
|
||||
|
||||
- ``hh``
|
||||
- ``hh:mm`` or ``hhmm``
|
||||
- ``hh:mm:ss`` or ``hhmmss``
|
||||
- ``hh:mm:ss.ssssss`` (Up to 6 sub-second digits)
|
||||
|
||||
Midnight is a special case for `hh`, as the standard supports both
|
||||
00:00 and 24:00 as a representation. The decimal separator can be
|
||||
either a dot or a comma.
|
||||
|
||||
|
||||
.. caution::
|
||||
|
||||
Support for fractional components other than seconds is part of the
|
||||
ISO-8601 standard, but is not currently implemented in this parser.
|
||||
|
||||
Supported time zone offset formats are:
|
||||
|
||||
- `Z` (UTC)
|
||||
- `±HH:MM`
|
||||
- `±HHMM`
|
||||
- `±HH`
|
||||
|
||||
Offsets will be represented as :class:`dateutil.tz.tzoffset` objects,
|
||||
with the exception of UTC, which will be represented as
|
||||
:class:`dateutil.tz.tzutc`. Time zone offsets equivalent to UTC (such
|
||||
as `+00:00`) will also be represented as :class:`dateutil.tz.tzutc`.
|
||||
|
||||
:param dt_str:
|
||||
A string or stream containing only an ISO-8601 datetime string
|
||||
|
||||
:return:
|
||||
Returns a :class:`datetime.datetime` representing the string.
|
||||
Unspecified components default to their lowest value.
|
||||
|
||||
.. warning::
|
||||
|
||||
As of version 2.7.0, the strictness of the parser should not be
|
||||
considered a stable part of the contract. Any valid ISO-8601 string
|
||||
that parses correctly with the default settings will continue to
|
||||
parse correctly in future versions, but invalid strings that
|
||||
currently fail (e.g. ``2017-01-01T00:00+00:00:00``) are not
|
||||
guaranteed to continue failing in future versions if they encode
|
||||
a valid date.
|
||||
|
||||
.. versionadded:: 2.7.0
|
||||
"""
|
||||
components, pos = self._parse_isodate(dt_str)
|
||||
|
||||
if len(dt_str) > pos:
|
||||
if self._sep is None or dt_str[pos:pos + 1] == self._sep:
|
||||
components += self._parse_isotime(dt_str[pos + 1:])
|
||||
else:
|
||||
raise ValueError('String contains unknown ISO components')
|
||||
|
||||
if len(components) > 3 and components[3] == 24:
|
||||
components[3] = 0
|
||||
return datetime(*components) + timedelta(days=1)
|
||||
|
||||
return datetime(*components)
|
||||
|
||||
@_takes_ascii
|
||||
def parse_isodate(self, datestr):
    """
    Parse a standalone ISO date string.

    :param datestr:
        The date portion of an ISO string, without a separator

    :return:
        Returns a :class:`datetime.date` object

    :raises ValueError:
        If characters remain after the date has been consumed.
    """
    ymd, consumed = self._parse_isodate(datestr)
    if consumed >= len(datestr):
        return date(*ymd)

    # Anything left over is not part of a date-only string.
    leftover_msg = 'components: {!r}'.format(datestr.decode('ascii'))
    raise ValueError('String contains unknown ISO ' + leftover_msg)
|
||||
|
||||
@_takes_ascii
|
||||
def parse_isotime(self, timestr):
    """
    Parse a standalone ISO time string.

    :param timestr:
        The time portion of an ISO string, without a separator

    :return:
        Returns a :class:`datetime.time` object
    """
    fields = self._parse_isotime(timestr)

    # An hour of 24 is rewritten to 0 before building the time object;
    # a bare time has no date to roll forward.
    hour_is_24 = fields[0] == 24
    if hour_is_24:
        fields[0] = 0

    return time(*fields)
|
||||
|
||||
@_takes_ascii
|
||||
def parse_tzstr(self, tzstr, zero_as_utc=True):
    """
    Parse a valid ISO time zone string.

    This is the public entry point; the work is delegated to
    ``_parse_tzstr``. See :func:`isoparser.isoparse` for details on
    supported formats.

    :param tzstr:
        A string representing an ISO time zone offset

    :param zero_as_utc:
        Whether to return :class:`dateutil.tz.tzutc` for zero-offset zones

    :return:
        Returns :class:`dateutil.tz.tzoffset` for offsets and
        :class:`dateutil.tz.tzutc` for ``Z`` and (if ``zero_as_utc`` is
        specified) offsets equivalent to UTC.
    """
    zone = self._parse_tzstr(tzstr, zero_as_utc=zero_as_utc)
    return zone
|
||||
|
||||
# Constants
|
||||
_DATE_SEP = b'-'
|
||||
_TIME_SEP = b':'
|
||||
_FRACTION_REGEX = re.compile(b'[\\.,]([0-9]+)')
|
||||
|
||||
def _parse_isodate(self, dt_str):
    """
    Parse the date at the start of ``dt_str``.

    The common calendar-date parser is tried first; if it raises
    :class:`ValueError`, the week-date / ordinal-date parser is used.

    :return:
        Whatever the selected parser returns (a ``(components, pos)`` pair).
    """
    try:
        parsed = self._parse_isodate_common(dt_str)
    except ValueError:
        parsed = self._parse_isodate_uncommon(dt_str)
    return parsed
|
||||
|
||||
def _parse_isodate_common(self, dt_str):
    """
    Parse a calendar date (``YYYY``, ``YYYY-MM``, ``YYYY-MM-DD`` or
    ``YYYYMMDD``) from the start of ``dt_str``.

    :param dt_str:
        Byte string beginning with the date.

    :return:
        ``([year, month, day], pos)`` where ``pos`` is the index just past
        the date. A missing month or day is reported as 1.

    :raises ValueError:
        If the string is too short or the separators are inconsistent.
    """
    total = len(dt_str)
    if total < 4:
        raise ValueError('ISO string too short')

    year, month, day = int(dt_str[0:4]), 1, 1
    cursor = 4
    if cursor >= total:
        return [year, month, day], cursor

    dashed = dt_str[cursor:cursor + 1] == self._DATE_SEP
    if dashed:
        cursor += 1

    # Month
    if total - cursor < 2:
        raise ValueError('Invalid common month')
    month = int(dt_str[cursor:cursor + 2])
    cursor += 2

    if cursor >= total:
        # Year + month alone is accepted only in the dashed form.
        if not dashed:
            raise ValueError('Invalid ISO format')
        return [year, month, day], cursor

    if dashed:
        if dt_str[cursor:cursor + 1] != self._DATE_SEP:
            raise ValueError('Invalid separator in ISO string')
        cursor += 1

    # Day
    if total - cursor < 2:
        raise ValueError('Invalid common day')
    day = int(dt_str[cursor:cursor + 2])
    return [year, month, day], cursor + 2
|
||||
|
||||
def _parse_isodate_uncommon(self, dt_str):
    """
    Parse an ISO week date (``YYYY-?Www-?D?``) or ordinal date
    (``YYYYDDD`` / ``YYYY-DDD``) from the start of ``dt_str``.

    :param dt_str:
        Byte string beginning with the date.

    :return:
        ``([year, month, day], pos)`` for the equivalent calendar date,
        where ``pos`` is the index just past the date.

    :raises ValueError:
        If the string is too short, the dash usage is inconsistent, or the
        ordinal day is out of range for the year.
    """
    if len(dt_str) < 4:
        raise ValueError('ISO string too short')

    # All ISO formats start with the year
    year = int(dt_str[0:4])

    dashed = dt_str[4:5] == self._DATE_SEP
    cursor = 5 if dashed else 4  # Skip '-' if it's there

    if dt_str[cursor:cursor + 1] == b'W':
        # Week date: two-digit week, then an optional one-digit weekday.
        cursor += 1
        weekno = int(dt_str[cursor:cursor + 2])
        cursor += 2

        dayno = 1
        if len(dt_str) > cursor:
            next_is_dash = dt_str[cursor:cursor + 1] == self._DATE_SEP
            if next_is_dash != dashed:
                raise ValueError('Inconsistent use of dash separator')

            if dashed:
                cursor += 1

            dayno = int(dt_str[cursor:cursor + 1])
            cursor += 1

        resolved = self._calculate_weekdate(year, weekno, dayno)
    else:
        # Ordinal date: three-digit day of the year.
        if len(dt_str) - cursor < 3:
            raise ValueError('Invalid ordinal day')

        ordinal_day = int(dt_str[cursor:cursor + 3])
        cursor += 3

        days_in_year = 366 if calendar.isleap(year) else 365
        if not 1 <= ordinal_day <= days_in_year:
            raise ValueError('Invalid ordinal day' +
                             ' {} for year {}'.format(ordinal_day, year))

        resolved = date(year, 1, 1) + timedelta(days=ordinal_day - 1)

    return [resolved.year, resolved.month, resolved.day], cursor
|
||||
|
||||
def _calculate_weekdate(self, year, week, day):
    """
    Calculate the calendar day corresponding to an ISO year-week-day.

    This function is effectively the inverse of
    :func:`datetime.date.isocalendar`.

    :param year:
        The year in the ISO calendar

    :param week:
        The week in the ISO calendar - range is [1, 53]

    :param day:
        The day in the ISO calendar - range is [1 (MON), 7 (SUN)]

    :return:
        Returns a :class:`datetime.date`

    :raises ValueError:
        If ``week`` or ``day`` is outside its range.
    """
    if week <= 0 or week >= 54:
        raise ValueError('Invalid week: {}'.format(week))

    if day <= 0 or day >= 8:  # Range is 1-7
        raise ValueError('Invalid weekday: {}'.format(day))

    # Week 1 always has January 4th in it; step back to that week's Monday.
    jan_4 = date(year, 1, 4)
    monday_of_week_1 = jan_4 - timedelta(days=jan_4.weekday())

    # Now add the specific number of weeks and days to get what we want
    return monday_of_week_1 + timedelta(weeks=week - 1, days=day - 1)
|
||||
|
||||
def _parse_isotime(self, timestr):
    """
    Parse an ISO time, optionally followed by a time zone designator.

    :param timestr:
        Byte string holding the time portion only.

    :return:
        A list ``[hour, minute, second, microsecond, tzinfo]``; fields that
        are absent keep their defaults (0, or ``None`` for the zone). An
        hour of 24 is returned as-is for the caller to interpret.

    :raises ValueError:
        If the string is too short, mixes colon and non-colon forms, has
        trailing characters, or uses hour 24 with any non-zero field.
    """
    total = len(timestr)
    fields = [0, 0, 0, 0, None]

    if total < 2:
        raise ValueError('ISO time too short')

    cursor = 0
    colon_form = False

    # Slots 0-2 are hour/minute/second and slot 3 is the fraction; the two
    # further passes only give a trailing zone designator a chance to match.
    for slot in range(6):
        if cursor >= total:
            break

        if timestr[cursor:cursor + 1] in b'-+Zz':
            # Detect time zone boundary
            fields[-1] = self._parse_tzstr(timestr[cursor:])
            cursor = total
            break

        if slot == 1 and timestr[cursor:cursor + 1] == self._TIME_SEP:
            colon_form = True
            cursor += 1
        elif slot == 2 and colon_form:
            if timestr[cursor:cursor + 1] != self._TIME_SEP:
                raise ValueError('Inconsistent use of colon separator')
            cursor += 1

        if slot < 3:
            # Hour, minute, second
            fields[slot] = int(timestr[cursor:cursor + 2])
            cursor += 2
        elif slot == 3:
            # Fraction of a second
            hit = self._FRACTION_REGEX.match(timestr[cursor:])
            if hit:
                digits = hit.group(1)[:6]  # Truncate to microseconds
                fields[slot] = int(digits) * 10**(6 - len(digits))
                cursor += len(hit.group())

    if cursor < total:
        raise ValueError('Unused components in ISO string')

    # Standard supports 00:00 and 24:00 as representations of midnight
    if fields[0] == 24 and any(value != 0 for value in fields[1:4]):
        raise ValueError('Hour may only be 24 at 24:00:00.000')

    return fields
|
||||
|
||||
def _parse_tzstr(self, tzstr, zero_as_utc=True):
    """
    Convert an ISO zone designator (``Z``, ``±HH``, ``±HHMM`` or
    ``±HH:MM``) into a tzinfo object.

    :param tzstr:
        Byte string holding only the zone designator.

    :param zero_as_utc:
        When true, a zero offset is returned as ``tz.UTC`` instead of a
        ``tz.tzoffset``.

    :return:
        ``tz.UTC`` or a ``tz.tzoffset`` with the offset in seconds.

    :raises ValueError:
        If the length is unsupported, the sign is missing, or the hours or
        minutes are out of range.
    """
    if tzstr in (b'Z', b'z'):
        return tz.UTC

    if len(tzstr) not in {3, 5, 6}:
        raise ValueError('Time zone offset must be 1, 3, 5 or 6 characters')

    sign_char = tzstr[0:1]
    if sign_char == b'-':
        mult = -1
    elif sign_char == b'+':
        mult = 1
    else:
        raise ValueError('Time zone offset requires sign')

    hours = int(tzstr[1:3])
    if len(tzstr) == 3:
        minutes = 0
    else:
        # Minutes start after the optional colon.
        minutes_start = 4 if tzstr[3:4] == self._TIME_SEP else 3
        minutes = int(tzstr[minutes_start:])

    if zero_as_utc and hours == 0 and minutes == 0:
        return tz.UTC

    if minutes > 59:
        raise ValueError('Invalid minutes in time zone offset')

    if hours > 23:
        raise ValueError('Invalid hours in time zone offset')

    return tz.tzoffset(None, mult * (hours * 60 + minutes) * 60)
|
||||
|
||||
|
||||
DEFAULT_ISOPARSER = isoparser()
|
||||
isoparse = DEFAULT_ISOPARSER.isoparse
|
||||
599
venv/lib/python3.11/site-packages/dateutil/relativedelta.py
Normal file
599
venv/lib/python3.11/site-packages/dateutil/relativedelta.py
Normal file
@ -0,0 +1,599 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import datetime
|
||||
import calendar
|
||||
|
||||
import operator
|
||||
from math import copysign
|
||||
|
||||
from six import integer_types
|
||||
from warnings import warn
|
||||
|
||||
from ._common import weekday
|
||||
|
||||
MO, TU, WE, TH, FR, SA, SU = weekdays = tuple(weekday(x) for x in range(7))
|
||||
|
||||
__all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"]
|
||||
|
||||
|
||||
class relativedelta(object):
|
||||
"""
|
||||
The relativedelta type is designed to be applied to an existing datetime and
|
||||
can replace specific components of that datetime, or represents an interval
|
||||
of time.
|
||||
|
||||
It is based on the specification of the excellent work done by M.-A. Lemburg
|
||||
in his
|
||||
`mx.DateTime <https://www.egenix.com/products/python/mxBase/mxDateTime/>`_ extension.
|
||||
However, notice that this type does *NOT* implement the same algorithm as
|
||||
his work. Do *NOT* expect it to behave like mx.DateTime's counterpart.
|
||||
|
||||
There are two different ways to build a relativedelta instance. The
|
||||
first one is passing it two date/datetime classes::
|
||||
|
||||
relativedelta(datetime1, datetime2)
|
||||
|
||||
The second one is passing it any number of the following keyword arguments::
|
||||
|
||||
relativedelta(arg1=x,arg2=y,arg3=z...)
|
||||
|
||||
year, month, day, hour, minute, second, microsecond:
|
||||
Absolute information (argument is singular); adding or subtracting a
|
||||
relativedelta with absolute information does not perform an arithmetic
|
||||
operation, but rather REPLACES the corresponding value in the
|
||||
original datetime with the value(s) in relativedelta.
|
||||
|
||||
years, months, weeks, days, hours, minutes, seconds, microseconds:
|
||||
Relative information, may be negative (argument is plural); adding
|
||||
or subtracting a relativedelta with relative information performs
|
||||
the corresponding arithmetic operation on the original datetime value
|
||||
with the information in the relativedelta.
|
||||
|
||||
weekday:
|
||||
One of the weekday instances (MO, TU, etc) available in the
|
||||
relativedelta module. These instances may receive a parameter N,
|
||||
specifying the Nth weekday, which could be positive or negative
|
||||
(like MO(+1) or MO(-2)). Not specifying it is the same as specifying
|
||||
+1. You can also use an integer, where 0=MO. This argument is always
|
||||
relative e.g. if the calculated date is already Monday, using MO(1)
|
||||
or MO(-1) won't change the day. To effectively make it absolute, use
|
||||
it in combination with the day argument (e.g. day=1, MO(1) for first
|
||||
Monday of the month).
|
||||
|
||||
leapdays:
|
||||
Will add given days to the date found, if year is a leap
|
||||
year, and the date found is post 28 of february.
|
||||
|
||||
yearday, nlyearday:
|
||||
Set the yearday or the non-leap year day (jump leap days).
|
||||
These are converted to day/month/leapdays information.
|
||||
|
||||
There are relative and absolute forms of the keyword
|
||||
arguments. The plural is relative, and the singular is
|
||||
absolute. For each argument in the order below, the absolute form
|
||||
is applied first (by setting each attribute to that value) and
|
||||
then the relative form (by adding the value to the attribute).
|
||||
|
||||
The order of attributes considered when this relativedelta is
|
||||
added to a datetime is:
|
||||
|
||||
1. Year
|
||||
2. Month
|
||||
3. Day
|
||||
4. Hours
|
||||
5. Minutes
|
||||
6. Seconds
|
||||
7. Microseconds
|
||||
|
||||
Finally, weekday is applied, using the rule described above.
|
||||
|
||||
For example
|
||||
|
||||
>>> from datetime import datetime
|
||||
>>> from dateutil.relativedelta import relativedelta, MO
|
||||
>>> dt = datetime(2018, 4, 9, 13, 37, 0)
|
||||
>>> delta = relativedelta(hours=25, day=1, weekday=MO(1))
|
||||
>>> dt + delta
|
||||
datetime.datetime(2018, 4, 2, 14, 37)
|
||||
|
||||
First, the day is set to 1 (the first of the month), then 25 hours
|
||||
are added, to get to the 2nd day and 14th hour, finally the
|
||||
weekday is applied, but since the 2nd is already a Monday there is
|
||||
no effect.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, dt1=None, dt2=None,
             years=0, months=0, days=0, leapdays=0, weeks=0,
             hours=0, minutes=0, seconds=0, microseconds=0,
             year=None, month=None, day=None, weekday=None,
             yearday=None, nlyearday=None,
             hour=None, minute=None, second=None, microsecond=None):
    """
    Build the delta either from the difference ``dt1 - dt2`` (when both
    are given and truthy) or from the relative/absolute keyword arguments.

    :raises TypeError:
        If ``dt1``/``dt2`` are given but are not both dates/datetimes.

    :raises ValueError:
        If ``years`` or ``months`` is not integer-valued, or the year day
        is greater than 366.
    """
    if dt1 and dt2:
        # datetime is a subclass of date. So both must be date
        both_dates = (isinstance(dt1, datetime.date) and
                      isinstance(dt2, datetime.date))
        if not both_dates:
            raise TypeError("relativedelta only diffs datetime/date")

        # We allow two dates, or two datetimes, so we coerce them to be
        # of the same type
        dt1_is_datetime = isinstance(dt1, datetime.datetime)
        dt2_is_datetime = isinstance(dt2, datetime.datetime)
        if dt1_is_datetime != dt2_is_datetime:
            if not dt1_is_datetime:
                dt1 = datetime.datetime.fromordinal(dt1.toordinal())
            else:
                dt2 = datetime.datetime.fromordinal(dt2.toordinal())

        for name in ("years", "months", "days", "leapdays",
                     "hours", "minutes", "seconds", "microseconds"):
            setattr(self, name, 0)
        for name in ("year", "month", "day", "weekday",
                     "hour", "minute", "second", "microsecond"):
            setattr(self, name, None)
        self._has_time = 0

        # Get year / month delta between the two
        months = (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month)
        self._set_months(months)

        # Remove the year/month delta so the timedelta is just well-defined
        # time units (seconds, days and microseconds)
        dtm = self.__radd__(dt2)

        # If we've overshot our target, make an adjustment
        if dt1 < dt2:
            compare, increment = operator.gt, 1
        else:
            compare, increment = operator.lt, -1

        while compare(dt1, dtm):
            months += increment
            self._set_months(months)
            dtm = self.__radd__(dt2)

        # Get the timedelta between the "months-adjusted" date and dt1
        delta = dt1 - dtm
        self.seconds = delta.seconds + delta.days * 86400
        self.microseconds = delta.microseconds
    else:
        # Check for non-integer values in integer-only quantities
        if any(x is not None and x != int(x) for x in (years, months)):
            raise ValueError("Non-integer years and months are "
                             "ambiguous and not currently supported.")

        # Relative information
        self.years = int(years)
        self.months = int(months)
        self.days = days + weeks * 7
        self.leapdays = leapdays
        self.hours = hours
        self.minutes = minutes
        self.seconds = seconds
        self.microseconds = microseconds

        # Absolute information
        self.year = year
        self.month = month
        self.day = day
        self.hour = hour
        self.minute = minute
        self.second = second
        self.microsecond = microsecond

        absolute_values = (year, month, day, hour,
                           minute, second, microsecond)
        if any(x is not None and int(x) != x for x in absolute_values):
            # For now we'll deprecate floats - later it'll be an error.
            warn("Non-integer value passed as absolute information. " +
                 "This is not a well-defined condition and will raise " +
                 "errors in future versions.", DeprecationWarning)

        # An integer weekday is an index into the module's weekday tuple.
        if isinstance(weekday, integer_types):
            self.weekday = weekdays[weekday]
        else:
            self.weekday = weekday

        yday = 0
        if nlyearday:
            yday = nlyearday
        elif yearday:
            yday = yearday
            if yearday > 59:
                self.leapdays = -1
        if yday:
            # Cumulative day counts at the end of each month.
            month_ends = [31, 59, 90, 120, 151, 181, 212,
                          243, 273, 304, 334, 366]
            for idx, last_day in enumerate(month_ends):
                if yday <= last_day:
                    self.month = idx+1
                    self.day = yday if idx == 0 else yday-month_ends[idx-1]
                    break
            else:
                raise ValueError("invalid year day (%d)" % yday)

    self._fix()
|
||||
|
||||
def _fix(self):
    """
    Carry overflow upward through the relative fields (microseconds to
    seconds to minutes to hours to days, and months to years), then
    recompute ``_has_time``.
    """
    # (field, largest magnitude allowed, units per carry, field carried into)
    carry_chain = (
        ("microseconds", 999999, 1000000, "seconds"),
        ("seconds", 59, 60, "minutes"),
        ("minutes", 59, 60, "hours"),
        ("hours", 23, 24, "days"),
        ("months", 11, 12, "years"),
    )
    for field, limit, base, target in carry_chain:
        current = getattr(self, field)
        if abs(current) > limit:
            # Divide the magnitude and re-apply the sign afterwards.
            s = _sign(current)
            div, mod = divmod(current * s, base)
            setattr(self, field, mod * s)
            setattr(self, target, getattr(self, target) + div * s)

    carries_time = (
        self.hours or self.minutes or self.seconds or self.microseconds
        or self.hour is not None or self.minute is not None or
        self.second is not None or self.microsecond is not None)
    self._has_time = 1 if carries_time else 0
|
||||
|
||||
@property
|
||||
def weeks(self):
    """Return the number of whole weeks in ``days``, truncated toward zero."""
    whole_weeks = int(self.days / 7.0)
    return whole_weeks
|
||||
|
||||
@weeks.setter
|
||||
def weeks(self, value):
    """Replace the whole-week part of ``days`` with ``value`` weeks, keeping the leftover days."""
    leftover_days = self.days - (self.weeks * 7)
    self.days = leftover_days + value * 7
|
||||
|
||||
def _set_months(self, months):
    """
    Store ``months`` and overwrite ``years``: anything beyond 11 months
    in magnitude is folded into ``years``, otherwise ``years`` becomes 0.
    """
    self.months = months
    if abs(self.months) <= 11:
        self.years = 0
        return

    # Divide the magnitude and re-apply the sign afterwards.
    s = _sign(self.months)
    div, mod = divmod(self.months * s, 12)
    self.months = mod * s
    self.years = div * s
|
||||
|
||||
def normalized(self):
    """
    Return a version of this object represented entirely using integer
    values for the relative attributes.

    >>> relativedelta(days=1.5, hours=2).normalized()
    relativedelta(days=+1, hours=+14)

    :return:
        Returns a :class:`dateutil.relativedelta.relativedelta` object.
    """
    # Cascade remainders down (rounding each to roughly nearest microsecond)
    whole_days = int(self.days)

    hours_exact = round(self.hours + 24 * (self.days - whole_days), 11)
    whole_hours = int(hours_exact)

    minutes_exact = round(self.minutes + 60 * (hours_exact - whole_hours), 10)
    whole_minutes = int(minutes_exact)

    seconds_exact = round(self.seconds + 60 * (minutes_exact - whole_minutes), 8)
    whole_seconds = int(seconds_exact)

    whole_micros = round(self.microseconds + 1e6 * (seconds_exact - whole_seconds))

    # Constructor carries overflow back up with call to _fix()
    kwargs = dict(years=self.years, months=self.months,
                  days=whole_days, hours=whole_hours, minutes=whole_minutes,
                  seconds=whole_seconds, microseconds=whole_micros,
                  leapdays=self.leapdays)
    for name in ("year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        kwargs[name] = getattr(self, name)
    return self.__class__(**kwargs)
|
||||
|
||||
def __add__(self, other):
    """
    Add another relativedelta, a :class:`datetime.timedelta`, or apply this
    delta to a date/datetime.

    :return:
        A new relativedelta for delta operands, a date/datetime when applied
        to one, or ``NotImplemented`` for any other operand.
    """
    relative_units = ("years", "months", "days", "hours",
                      "minutes", "seconds", "microseconds")
    absolute_fields = ("year", "month", "day", "weekday",
                       "hour", "minute", "second", "microsecond")

    if isinstance(other, relativedelta):
        merged = {}
        for unit in relative_units:
            merged[unit] = getattr(other, unit) + getattr(self, unit)
        merged["leapdays"] = other.leapdays or self.leapdays
        # For absolute fields the right-hand operand wins when it is set.
        for name in absolute_fields:
            theirs = getattr(other, name)
            merged[name] = theirs if theirs is not None else getattr(self, name)
        return self.__class__(**merged)

    if isinstance(other, datetime.timedelta):
        summed = {}
        for name in relative_units + ("leapdays",) + absolute_fields:
            summed[name] = getattr(self, name)
        summed["days"] = self.days + other.days
        summed["seconds"] = self.seconds + other.seconds
        summed["microseconds"] = self.microseconds + other.microseconds
        return self.__class__(**summed)

    if not isinstance(other, datetime.date):
        return NotImplemented
    elif self._has_time and not isinstance(other, datetime.datetime):
        # Promote a plain date so the time components have somewhere to go.
        other = datetime.datetime.fromordinal(other.toordinal())

    target_year = (self.year or other.year)+self.years
    target_month = self.month or other.month
    if self.months:
        assert 1 <= abs(self.months) <= 12
        target_month += self.months
        if target_month > 12:
            target_year += 1
            target_month -= 12
        elif target_month < 1:
            target_year -= 1
            target_month += 12

    # Clamp the day to the length of the target month.
    last_day = calendar.monthrange(target_year, target_month)[1]
    target_day = min(last_day, self.day or other.day)

    repl = {"year": target_year, "month": target_month, "day": target_day}
    for attr in ["hour", "minute", "second", "microsecond"]:
        value = getattr(self, attr)
        if value is not None:
            repl[attr] = value

    extra_days = self.days
    if self.leapdays and target_month > 2 and calendar.isleap(target_year):
        extra_days += self.leapdays

    shift = datetime.timedelta(days=extra_days,
                               hours=self.hours,
                               minutes=self.minutes,
                               seconds=self.seconds,
                               microseconds=self.microseconds)
    ret = other.replace(**repl) + shift

    if self.weekday:
        wanted, nth = self.weekday.weekday, self.weekday.n or 1
        jumpdays = (abs(nth) - 1) * 7
        if nth > 0:
            jumpdays += (7 - ret.weekday() + wanted) % 7
        else:
            jumpdays += (ret.weekday() - wanted) % 7
            jumpdays *= -1
        ret += datetime.timedelta(days=jumpdays)
    return ret
|
||||
|
||||
def __radd__(self, other):
    """Reflected addition; delegates to ``__add__`` with the same operand."""
    total = self.__add__(other)
    return total
|
||||
|
||||
def __rsub__(self, other):
    """Reflected subtraction: ``other - self`` is computed as ``other + (-self)``."""
    negated = self.__neg__()
    return negated.__radd__(other)
|
||||
|
||||
def __sub__(self, other):
    """
    Subtract another relativedelta from this one.

    Relative fields are subtracted; for ``leapdays`` and the absolute
    fields this object's value is used when set, otherwise ``other``'s.

    :return:
        A new relativedelta, or ``NotImplemented`` if ``other`` is not a
        relativedelta.
    """
    if not isinstance(other, relativedelta):
        return NotImplemented   # In case the other object defines __rsub__

    kwargs = {}
    for unit in ("years", "months", "days", "hours",
                 "minutes", "seconds", "microseconds"):
        kwargs[unit] = getattr(self, unit) - getattr(other, unit)

    kwargs["leapdays"] = self.leapdays or other.leapdays

    for name in ("year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        mine = getattr(self, name)
        kwargs[name] = mine if mine is not None else getattr(other, name)

    return self.__class__(**kwargs)
|
||||
|
||||
def __abs__(self):
    """
    Return a copy whose relative fields (years through microseconds) are
    replaced by their absolute values; ``leapdays`` and the absolute
    fields are copied unchanged.
    """
    kwargs = {}
    for unit in ("years", "months", "days", "hours",
                 "minutes", "seconds", "microseconds"):
        kwargs[unit] = abs(getattr(self, unit))
    for name in ("leapdays", "year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        kwargs[name] = getattr(self, name)
    return self.__class__(**kwargs)
|
||||
|
||||
def __neg__(self):
    """
    Return a copy whose relative fields (years through microseconds) are
    negated; ``leapdays`` and the absolute fields are copied unchanged.
    """
    kwargs = {}
    for unit in ("years", "months", "days", "hours",
                 "minutes", "seconds", "microseconds"):
        kwargs[unit] = -getattr(self, unit)
    for name in ("leapdays", "year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        kwargs[name] = getattr(self, name)
    return self.__class__(**kwargs)
|
||||
|
||||
def __bool__(self):
    """
    Return True if any relative field (including ``leapdays``) is non-zero
    or any absolute field is set (not ``None``).
    """
    if (self.years or self.months or self.days or self.hours or
            self.minutes or self.seconds or self.microseconds or
            self.leapdays):
        return True

    return any(getattr(self, name) is not None
               for name in ("year", "month", "day", "weekday",
                            "hour", "minute", "second", "microsecond"))
|
||||
# Compatibility with Python 2.x
|
||||
__nonzero__ = __bool__
|
||||
|
||||
def __mul__(self, other):
    """
    Scale the relative fields by ``float(other)``, truncating each product
    with ``int``; ``leapdays`` and the absolute fields are copied unchanged.

    :return:
        A new relativedelta, or ``NotImplemented`` if ``float(other)``
        raises :class:`TypeError`.
    """
    try:
        factor = float(other)
    except TypeError:
        return NotImplemented

    kwargs = {}
    for unit in ("years", "months", "days", "hours",
                 "minutes", "seconds", "microseconds"):
        kwargs[unit] = int(getattr(self, unit) * factor)
    for name in ("leapdays", "year", "month", "day", "weekday",
                 "hour", "minute", "second", "microsecond"):
        kwargs[name] = getattr(self, name)
    return self.__class__(**kwargs)
|
||||
|
||||
__rmul__ = __mul__
|
||||
|
||||
def __eq__(self, other):
    """
    Compare two relativedeltas field by field.

    Weekdays match when their ``weekday`` values agree and their ``n``
    values are equal, or are both unset or 1.

    :return:
        ``True``/``False``, or ``NotImplemented`` if ``other`` is not a
        relativedelta.
    """
    if not isinstance(other, relativedelta):
        return NotImplemented

    mine, theirs = self.weekday, other.weekday
    if mine or theirs:
        if not mine or not theirs:
            return False
        if mine.weekday != theirs.weekday:
            return False
        n1, n2 = mine.n, theirs.n
        n1_is_default = not n1 or n1 == 1
        n2_is_default = not n2 or n2 == 1
        if n1 != n2 and not (n1_is_default and n2_is_default):
            return False

    return all(getattr(self, name) == getattr(other, name)
               for name in ("years", "months", "days", "hours",
                            "minutes", "seconds", "microseconds",
                            "leapdays", "year", "month", "day",
                            "hour", "minute", "second", "microsecond"))
|
||||
|
||||
def __hash__(self):
    """
    Return a hash consistent with ``__eq__``.

    ``__eq__`` treats a weekday whose ``n`` is unset the same as one whose
    ``n`` is 1, so the weekday is hashed through a normalized
    ``(weekday, n or 1)`` key rather than through the weekday object
    itself. Objects that compare equal therefore hash equal.
    """
    wd = self.weekday
    # Same truthiness test and same ``n`` normalization as __eq__.
    weekday_key = (wd.weekday, wd.n or 1) if wd else None

    return hash((
        weekday_key,
        self.years,
        self.months,
        self.days,
        self.hours,
        self.minutes,
        self.seconds,
        self.microseconds,
        self.leapdays,
        self.year,
        self.month,
        self.day,
        self.hour,
        self.minute,
        self.second,
        self.microsecond,
    ))
|
||||
|
||||
def __ne__(self, other):
    """
    Return the negation of ``__eq__``.

    When ``__eq__`` returns ``NotImplemented`` that value is passed through
    unchanged instead of being negated, so Python can fall back to the
    other operand's comparison; negating it would report unrelated
    objects as "not unequal".
    """
    outcome = self.__eq__(other)
    if outcome is NotImplemented:
        return NotImplemented
    return not outcome
|
||||
|
||||
def __div__(self, other):
    """
    Divide by a number by multiplying with ``1 / float(other)``.

    :return:
        The result of ``__mul__``, or ``NotImplemented`` if
        ``float(other)`` raises :class:`TypeError`.
    """
    try:
        divisor = float(other)
    except TypeError:
        return NotImplemented

    # Division by zero is left to propagate, as before.
    return self.__mul__(1 / divisor)
|
||||
|
||||
__truediv__ = __div__
|
||||
|
||||
def __repr__(self):
    """
    Return ``ClassName(...)`` listing the non-zero relative fields with an
    explicit sign, followed by the absolute fields that are set.
    """
    parts = [
        "{attr}={value:+g}".format(attr=attr, value=getattr(self, attr))
        for attr in ("years", "months", "days", "leapdays",
                     "hours", "minutes", "seconds", "microseconds")
        if getattr(self, attr)
    ]
    parts.extend(
        "{attr}={value}".format(attr=attr, value=repr(getattr(self, attr)))
        for attr in ("year", "month", "day", "weekday",
                     "hour", "minute", "second", "microsecond")
        if getattr(self, attr) is not None
    )
    return "{classname}({attrs})".format(classname=self.__class__.__name__,
                                         attrs=", ".join(parts))
|
||||
|
||||
|
||||
def _sign(x):
    """Return ``1`` or ``-1`` according to the sign of ``x`` (zero takes the sign of its float representation)."""
    unit = copysign(1, x)
    return int(unit)
|
||||
|
||||
# vim:ts=4:sw=4:et
|
||||
1737
venv/lib/python3.11/site-packages/dateutil/rrule.py
Normal file
1737
venv/lib/python3.11/site-packages/dateutil/rrule.py
Normal file
File diff suppressed because it is too large
Load Diff
12
venv/lib/python3.11/site-packages/dateutil/tz/__init__.py
Normal file
12
venv/lib/python3.11/site-packages/dateutil/tz/__init__.py
Normal file
@ -0,0 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from .tz import *
|
||||
from .tz import __doc__
|
||||
|
||||
__all__ = ["tzutc", "tzoffset", "tzlocal", "tzfile", "tzrange",
|
||||
"tzstr", "tzical", "tzwin", "tzwinlocal", "gettz",
|
||||
"enfold", "datetime_ambiguous", "datetime_exists",
|
||||
"resolve_imaginary", "UTC", "DeprecatedTzFormatWarning"]
|
||||
|
||||
|
||||
class DeprecatedTzFormatWarning(Warning):
    """
    Warning raised when time zones are parsed from deprecated formats.

    Subclasses :class:`Warning` directly, so it can be filtered on its own
    category.
    """
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
419
venv/lib/python3.11/site-packages/dateutil/tz/_common.py
Normal file
419
venv/lib/python3.11/site-packages/dateutil/tz/_common.py
Normal file
@ -0,0 +1,419 @@
|
||||
from six import PY2
|
||||
|
||||
from functools import wraps
|
||||
|
||||
from datetime import datetime, timedelta, tzinfo
|
||||
|
||||
|
||||
ZERO = timedelta(0)
|
||||
|
||||
__all__ = ['tzname_in_python2', 'enfold']
|
||||
|
||||
|
||||
def tzname_in_python2(namefunc):
    """Change unicode output into bytestrings in Python 2

    tzname() API changed in Python 3. It used to return bytes, but was changed
    to unicode strings

    :param namefunc:
        The ``tzname``-style callable to wrap.

    :return:
        ``namefunc`` itself when not running on Python 2; otherwise a
        wrapper that encodes any non-``None`` result.
    """
    if not PY2:
        return namefunc

    @wraps(namefunc)
    def adjust_encoding(*args, **kwargs):
        name = namefunc(*args, **kwargs)
        # None means "no name" and is passed through untouched.
        return name if name is None else name.encode()

    return adjust_encoding
|
||||
|
||||
|
||||
# The following is adapted from Alexander Belopolsky's tz library
|
||||
# https://github.com/abalkin/tz
|
||||
if hasattr(datetime, 'fold'):
|
||||
# Python 3.6+ (PEP 495): datetime natively supports the ``fold`` attribute
|
||||
def enfold(dt, fold=1):
    """
    Provides a unified interface for assigning the ``fold`` attribute to
    datetimes both before and after the implementation of PEP-495.

    This variant simply calls ``dt.replace(fold=fold)``.

    :param dt:
        The datetime to copy with a new ``fold`` value.

    :param fold:
        The value for the ``fold`` attribute in the returned datetime. This
        should be either 0 or 1.

    :return:
        Returns an object for which ``getattr(dt, 'fold', 0)`` returns
        ``fold`` for all versions of Python. In versions prior to
        Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
        subclass of :py:class:`datetime.datetime` with the ``fold``
        attribute added, if ``fold`` is 1.

    .. versionadded:: 2.6.0
    """
    folded = dt.replace(fold=fold)
    return folded
|
||||
|
||||
else:
|
||||
class _DatetimeWithFold(datetime):
|
||||
"""
|
||||
This is a class designed to provide a PEP 495-compliant interface for
|
||||
Python versions before 3.6. It is used only for dates in a fold, so
|
||||
the ``fold`` attribute is fixed at ``1``.
|
||||
|
||||
.. versionadded:: 2.6.0
|
||||
"""
|
||||
__slots__ = ()
|
||||
|
||||
def replace(self, *args, **kwargs):
|
||||
"""
|
||||
Return a datetime with the same attributes, except for those
|
||||
attributes given new values by whichever keyword arguments are
|
||||
specified. Note that tzinfo=None can be specified to create a naive
|
||||
datetime from an aware datetime with no conversion of date and time
|
||||
data.
|
||||
|
||||
This is reimplemented in ``_DatetimeWithFold`` because pypy3 will
|
||||
return a ``datetime.datetime`` even if ``fold`` is unchanged.
|
||||
"""
|
||||
argnames = (
|
||||
'year', 'month', 'day', 'hour', 'minute', 'second',
|
||||
'microsecond', 'tzinfo'
|
||||
)
|
||||
|
||||
for arg, argname in zip(args, argnames):
|
||||
if argname in kwargs:
|
||||
raise TypeError('Duplicate argument: {}'.format(argname))
|
||||
|
||||
kwargs[argname] = arg
|
||||
|
||||
for argname in argnames:
|
||||
if argname not in kwargs:
|
||||
kwargs[argname] = getattr(self, argname)
|
||||
|
||||
dt_class = self.__class__ if kwargs.get('fold', 1) else datetime
|
||||
|
||||
return dt_class(**kwargs)
|
||||
|
||||
@property
|
||||
def fold(self):
|
||||
return 1
|
||||
|
||||
def enfold(dt, fold=1):
|
||||
"""
|
||||
Provides a unified interface for assigning the ``fold`` attribute to
|
||||
datetimes both before and after the implementation of PEP-495.
|
||||
|
||||
:param fold:
|
||||
The value for the ``fold`` attribute in the returned datetime. This
|
||||
should be either 0 or 1.
|
||||
|
||||
:return:
|
||||
Returns an object for which ``getattr(dt, 'fold', 0)`` returns
|
||||
``fold`` for all versions of Python. In versions prior to
|
||||
Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
|
||||
subclass of :py:class:`datetime.datetime` with the ``fold``
|
||||
attribute added, if ``fold`` is 1.
|
||||
|
||||
.. versionadded:: 2.6.0
|
||||
"""
|
||||
if getattr(dt, 'fold', 0) == fold:
|
||||
return dt
|
||||
|
||||
args = dt.timetuple()[:6]
|
||||
args += (dt.microsecond, dt.tzinfo)
|
||||
|
||||
if fold:
|
||||
return _DatetimeWithFold(*args)
|
||||
else:
|
||||
return datetime(*args)
|
||||
|
||||
|
||||
def _validate_fromutc_inputs(f):
    """
    Decorator adding the argument checks of the CPython ``fromutc``: the
    input must be a ``datetime`` object and must have ``self`` attached as
    its ``tzinfo``.

    :raises TypeError:
        From the wrapper, if ``dt`` is not a ``datetime``.

    :raises ValueError:
        From the wrapper, if ``dt.tzinfo`` is not ``self``.
    """
    @wraps(f)
    def fromutc(self, dt):
        if isinstance(dt, datetime):
            if dt.tzinfo is self:
                return f(self, dt)
            raise ValueError("dt.tzinfo is not self")
        raise TypeError("fromutc() requires a datetime argument")

    return fromutc
|
||||
|
||||
|
||||
class _tzinfo(tzinfo):
    """
    Common base class for the ``tzinfo`` implementations in ``dateutil``.
    """

    def is_ambiguous(self, dt):
        """
        Whether or not the "wall time" of a given datetime is ambiguous in
        this zone.

        :param dt:
            A :py:class:`datetime.datetime`, naive or time zone aware.

        :return:
            Returns ``True`` if ambiguous, ``False`` otherwise.

        .. versionadded:: 2.6.0
        """
        localized = dt.replace(tzinfo=self)

        unfolded = enfold(localized, fold=0)
        folded = enfold(localized, fold=1)

        offsets_match = unfolded.utcoffset() == folded.utcoffset()
        walls_match = (unfolded.replace(tzinfo=None) ==
                       folded.replace(tzinfo=None))

        # Ambiguous: identical wall clock reading, yet the two fold values
        # resolve to different UTC offsets.
        return walls_match and not offsets_match

    def _fold_status(self, dt_utc, dt_wall):
        """
        Determine the fold status of a "wall" datetime, given the same
        instant as ``dt_utc``.  The calculation assumes that
        ``dt.utcoffset() - dt.dst()`` is constant for all datetimes and is
        the actual separation between ``dt_utc`` and ``dt_wall``.

        :param dt_utc:
            Representation of the datetime as UTC

        :param dt_wall:
            Representation of the datetime as "wall time". This parameter
            must either have a `fold` attribute or have a fold-naive
            :class:`datetime.tzinfo` attached, otherwise the calculation may
            fail.

        :return:
            ``0`` for unambiguous wall times; otherwise ``1`` when the wall
            time differs from ``dt_utc`` by exactly
            ``dt_utc.utcoffset() - dt_utc.dst()``, else ``0``.
        """
        if not self.is_ambiguous(dt_wall):
            return 0

        delta_wall = dt_wall - dt_utc
        return int(delta_wall == (dt_utc.utcoffset() - dt_utc.dst()))

    def _fold(self, dt):
        """Return ``dt.fold``, or ``0`` when ``dt`` has no such attribute."""
        return getattr(dt, 'fold', 0)

    def _fromutc(self, dt):
        """
        Given a timezone-aware datetime in a given timezone, calculates a
        timezone-aware datetime in a new timezone.

        :param dt:
            A timezone-aware :class:`datetime.datetime` object.

        :raises ValueError:
            If ``utcoffset()`` or ``dst()`` returns ``None``.
        """

        # Re-implement the algorithm from Python's datetime.py
        utc_offset = dt.utcoffset()
        if utc_offset is None:
            raise ValueError(
                "fromutc() requires a non-None utcoffset() result")

        # The original datetime.py code assumes that `dst()` defaults to
        # zero during ambiguous times. PEP 495 inverts this presumption, so
        # for pre-PEP 495 versions of python, we need to tweak the algorithm.
        dst_offset = dt.dst()
        if dst_offset is None:
            raise ValueError("fromutc() requires a non-None dst() result")

        # Shift by the standard (non-DST) part of the offset first.
        dt += utc_offset - dst_offset

        # Set fold=1 so we can default to being in the fold for
        # ambiguous dates.
        dst_offset = enfold(dt, fold=1).dst()
        if dst_offset is None:
            raise ValueError(
                "fromutc(): dt.dst gave inconsistent results; cannot convert")

        return dt + dst_offset

    @_validate_fromutc_inputs
    def fromutc(self, dt):
        """
        Given a timezone-aware datetime in a given timezone, calculates a
        timezone-aware datetime in a new timezone.

        Since this is the one time that we *know* we have an unambiguous
        datetime object, the opportunity is taken to work out the fold
        status of the result and attach it.

        :param dt:
            A timezone-aware :class:`datetime.datetime` object.
        """
        wall = self._fromutc(dt)

        # Calculate the fold status from the two representations, then set
        # it on the returned wall time.
        fold_value = self._fold_status(dt, wall)

        return enfold(wall, fold=fold_value)
|
||||
|
||||
|
||||
class tzrangebase(_tzinfo):
    """
    Abstract base class for time zones described by an annual transition
    into and out of DST. Child classes should implement:

    * ``__init__(self, *args, **kwargs)``
    * ``transitions(self, year)`` - expected to return a tuple of datetimes
      representing the DST on and off transitions in standard time, or
      ``None`` when there are no transitions.

    The methods below additionally read these attributes:

    * ``hasdst``: Boolean whether or not the zone uses DST.
    * ``_dst_offset`` / ``_std_offset``: :class:`datetime.timedelta` objects
      representing the respective UTC offsets.
    * ``_dst_abbr`` / ``_std_abbr``: Strings representing the timezone short
      abbreviations in DST and STD, respectively.

    .. versionadded:: 2.6.0
    """
    def __init__(self):
        raise NotImplementedError('tzrangebase is an abstract base class')

    def utcoffset(self, dt):
        """Return the DST or standard UTC offset applying to ``dt``."""
        in_dst = self._isdst(dt)

        if in_dst is None:
            return None
        return self._dst_offset if in_dst else self._std_offset

    def dst(self, dt):
        """Return the DST adjustment applying to ``dt`` (``ZERO`` if none)."""
        in_dst = self._isdst(dt)

        if in_dst is None:
            return None
        return self._dst_base_offset if in_dst else ZERO

    @tzname_in_python2
    def tzname(self, dt):
        """Return the DST or standard abbreviation applying to ``dt``."""
        return self._dst_abbr if self._isdst(dt) else self._std_abbr

    def fromutc(self, dt):
        """ Given a datetime in UTC, return local time """
        if not isinstance(dt, datetime):
            raise TypeError("fromutc() requires a datetime argument")

        if dt.tzinfo is not self:
            raise ValueError("dt.tzinfo is not self")

        # Get transitions - if there are none, fixed offset
        transitions = self.transitions(dt.year)
        if transitions is None:
            return dt + self.utcoffset(dt)

        # Get the transition times in UTC
        std_on, std_off = transitions
        utc_transitions = (std_on - self._std_offset,
                           std_off - self._std_offset)

        naive_utc = dt.replace(tzinfo=None)
        in_dst = self._naive_isdst(naive_utc, utc_transitions)

        wall = dt + (self._dst_offset if in_dst else self._std_offset)

        # Only a standard-time result can be the second occurrence of an
        # ambiguous wall time.
        fold_value = int(not in_dst and self.is_ambiguous(wall))

        return enfold(wall, fold=fold_value)

    def is_ambiguous(self, dt):
        """
        Whether or not the "wall time" of a given datetime is ambiguous in
        this zone.

        :param dt:
            A :py:class:`datetime.datetime`, naive or time zone aware.

        :return:
            Returns ``True`` if ambiguous, ``False`` otherwise.

        .. versionadded:: 2.6.0
        """
        if not self.hasdst:
            return False

        _, dst_end = self.transitions(dt.year)

        naive = dt.replace(tzinfo=None)
        # The ambiguous window starts at the DST-off transition and lasts
        # for the size of the DST shift.
        return dst_end <= naive < dst_end + self._dst_base_offset

    def _isdst(self, dt):
        """
        Tell whether ``dt`` falls in DST.

        :return:
            ``False`` for zones without DST or without transitions in
            ``dt.year``, ``None`` when ``dt`` is ``None``, otherwise a
            boolean that takes ``fold`` into account for ambiguous times.
        """
        if not self.hasdst:
            return False
        if dt is None:
            return None

        transitions = self.transitions(dt.year)
        if transitions is None:
            return False

        naive = dt.replace(tzinfo=None)
        in_dst = self._naive_isdst(naive, transitions)

        # Handle ambiguous dates
        if in_dst or not self.is_ambiguous(naive):
            return in_dst
        return not self._fold(naive)

    def _naive_isdst(self, dt, transitions):
        """
        Tell whether ``dt`` (compared with its ``tzinfo`` stripped) lies
        between the DST-on and DST-off entries of ``transitions``.
        """
        dston, dstoff = transitions

        naive = dt.replace(tzinfo=None)

        if dston < dstoff:
            return dston <= naive < dstoff
        # DST period wraps around the end of the year.
        return not dstoff <= naive < dston

    @property
    def _dst_base_offset(self):
        """Difference between the DST and the standard UTC offset."""
        return self._dst_offset - self._std_offset

    __hash__ = None

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        return "%s(...)" % self.__class__.__name__

    __reduce__ = object.__reduce__
|
||||
80
venv/lib/python3.11/site-packages/dateutil/tz/_factories.py
Normal file
80
venv/lib/python3.11/site-packages/dateutil/tz/_factories.py
Normal file
@ -0,0 +1,80 @@
|
||||
from datetime import timedelta
|
||||
import weakref
|
||||
from collections import OrderedDict
|
||||
|
||||
from six.moves import _thread
|
||||
|
||||
|
||||
class _TzSingleton(type):
    """Metaclass whose classes hand out one shared, lazily built instance."""

    def __init__(cls, *args, **kwargs):
        # Every class created by this metaclass starts without an instance.
        cls.__instance = None
        super(_TzSingleton, cls).__init__(*args, **kwargs)

    def __call__(cls):
        """Return the class's single instance, creating it on first use."""
        existing = cls.__instance
        if existing is None:
            existing = cls.__instance = super(_TzSingleton, cls).__call__()
        return existing
|
||||
|
||||
|
||||
class _TzFactory(type):
    """Base metaclass for the caching time zone factories."""

    def instance(cls, *args, **kwargs):
        """Alternate constructor that returns a fresh instance"""
        # Go straight to ``type.__call__`` so that a ``__call__`` defined by
        # a derived metaclass is not involved.
        fresh = type.__call__(cls, *args, **kwargs)
        return fresh
|
||||
|
||||
|
||||
class _TzOffsetFactory(_TzFactory):
|
||||
def __init__(cls, *args, **kwargs):
|
||||
cls.__instances = weakref.WeakValueDictionary()
|
||||
cls.__strong_cache = OrderedDict()
|
||||
cls.__strong_cache_size = 8
|
||||
|
||||
cls._cache_lock = _thread.allocate_lock()
|
||||
|
||||
def __call__(cls, name, offset):
|
||||
if isinstance(offset, timedelta):
|
||||
key = (name, offset.total_seconds())
|
||||
else:
|
||||
key = (name, offset)
|
||||
|
||||
instance = cls.__instances.get(key, None)
|
||||
if instance is None:
|
||||
instance = cls.__instances.setdefault(key,
|
||||
cls.instance(name, offset))
|
||||
|
||||
# This lock may not be necessary in Python 3. See GH issue #901
|
||||
with cls._cache_lock:
|
||||
cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
|
||||
|
||||
# Remove an item if the strong cache is overpopulated
|
||||
if len(cls.__strong_cache) > cls.__strong_cache_size:
|
||||
cls.__strong_cache.popitem(last=False)
|
||||
|
||||
return instance
|
||||
|
||||
|
||||
class _TzStrFactory(_TzFactory):
|
||||
def __init__(cls, *args, **kwargs):
|
||||
cls.__instances = weakref.WeakValueDictionary()
|
||||
cls.__strong_cache = OrderedDict()
|
||||
cls.__strong_cache_size = 8
|
||||
|
||||
cls.__cache_lock = _thread.allocate_lock()
|
||||
|
||||
def __call__(cls, s, posix_offset=False):
|
||||
key = (s, posix_offset)
|
||||
instance = cls.__instances.get(key, None)
|
||||
|
||||
if instance is None:
|
||||
instance = cls.__instances.setdefault(key,
|
||||
cls.instance(s, posix_offset))
|
||||
|
||||
# This lock may not be necessary in Python 3. See GH issue #901
|
||||
with cls.__cache_lock:
|
||||
cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
|
||||
|
||||
# Remove an item if the strong cache is overpopulated
|
||||
if len(cls.__strong_cache) > cls.__strong_cache_size:
|
||||
cls.__strong_cache.popitem(last=False)
|
||||
|
||||
return instance
|
||||
|
||||
1849
venv/lib/python3.11/site-packages/dateutil/tz/tz.py
Normal file
1849
venv/lib/python3.11/site-packages/dateutil/tz/tz.py
Normal file
File diff suppressed because it is too large
Load Diff
370
venv/lib/python3.11/site-packages/dateutil/tz/win.py
Normal file
370
venv/lib/python3.11/site-packages/dateutil/tz/win.py
Normal file
@ -0,0 +1,370 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This module provides an interface to the native time zone data on Windows,
|
||||
including :py:class:`datetime.tzinfo` implementations.
|
||||
|
||||
Attempting to import this module on a non-Windows platform will raise an
|
||||
:py:obj:`ImportError`.
|
||||
"""
|
||||
# This code was originally contributed by Jeffrey Harris.
|
||||
import datetime
|
||||
import struct
|
||||
|
||||
from six.moves import winreg
|
||||
from six import text_type
|
||||
|
||||
try:
|
||||
import ctypes
|
||||
from ctypes import wintypes
|
||||
except ValueError:
|
||||
# ValueError is raised on non-Windows systems for some horrible reason.
|
||||
raise ImportError("Running tzwin on non-Windows system")
|
||||
|
||||
from ._common import tzrangebase
|
||||
|
||||
__all__ = ["tzwin", "tzwinlocal", "tzres"]
|
||||
|
||||
ONEWEEK = datetime.timedelta(7)
|
||||
|
||||
TZKEYNAMENT = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones"
|
||||
TZKEYNAME9X = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Time Zones"
|
||||
TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation"
|
||||
|
||||
|
||||
def _settzkeyname():
    """
    Choose the registry path under which time zone definitions are stored.

    The ``Windows NT`` key (``TZKEYNAMENT``) is probed first; if it cannot
    be opened, the ``TZKEYNAME9X`` path is used instead.

    :return:
        Either ``TZKEYNAMENT`` or ``TZKEYNAME9X``.
    """
    # Use the handle as a context manager (as the rest of this module does)
    # so it is closed even if an unexpected exception escapes the probe.
    with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
        try:
            winreg.OpenKey(handle, TZKEYNAMENT).Close()
        except WindowsError:
            return TZKEYNAME9X
        return TZKEYNAMENT
|
||||
|
||||
|
||||
TZKEYNAME = _settzkeyname()
|
||||
|
||||
|
||||
class tzres(object):
    """
    Class for accessing ``tzres.dll``, which contains timezone name related
    resources.

    .. versionadded:: 2.5.0
    """
    p_wchar = ctypes.POINTER(wintypes.WCHAR)  # Pointer to a wide char

    def __init__(self, tzres_loc='tzres.dll'):
        """
        :param tzres_loc:
            Name or path of the resource DLL handed to ``ctypes.WinDLL``.
        """
        # Load the user32 DLL so we can load strings from tzres
        user32 = ctypes.WinDLL('user32')

        # Specify the LoadStringW function
        user32.LoadStringW.argtypes = (wintypes.HINSTANCE,
                                       wintypes.UINT,
                                       wintypes.LPWSTR,
                                       ctypes.c_int)

        self.LoadStringW = user32.LoadStringW
        self._tzres = ctypes.WinDLL(tzres_loc)
        self.tzres_loc = tzres_loc

    def load_name(self, offset):
        """
        Load a timezone name from a DLL offset (integer).

        >>> from dateutil.tzwin import tzres
        >>> tzr = tzres()
        >>> print(tzr.load_name(112))
        'Eastern Standard Time'

        :param offset:
            A positive integer value referring to a string from the tzres dll.

        .. note::

            Offsets found in the registry are generally of the form
            ``@tzres.dll,-114``. The offset in this case is 114, not -114.

        """
        resource = self.p_wchar()
        # ``LoadStringW`` is called with a buffer length of 0 and the address
        # of ``resource`` as the buffer; its return value is then used as the
        # number of characters to read through ``resource``.
        lpBuffer = ctypes.cast(ctypes.byref(resource), wintypes.LPWSTR)
        nchar = self.LoadStringW(self._tzres._handle, offset, lpBuffer, 0)
        return resource[:nchar]

    def name_from_string(self, tzname_str):
        """
        Parse strings as returned from the Windows registry into the time zone
        name as defined in the registry.

        >>> from dateutil.tzwin import tzres
        >>> tzr = tzres()
        >>> print(tzr.name_from_string('@tzres.dll,-251'))
        'Dateline Daylight Time'
        >>> print(tzr.name_from_string('Eastern Standard Time'))
        'Eastern Standard Time'

        :param tzname_str:
            A timezone name string as returned from a Windows registry key.

        :return:
            Returns the localized timezone string from tzres.dll if the string
            is of the form `@tzres.dll,-offset`, else returns the input string.

        :raises ValueError:
            If the string starts with ``@`` but has no integer after ``,-``.
        """
        if not tzname_str.startswith('@'):
            return tzname_str

        name_splt = tzname_str.split(',-')
        try:
            offset = int(name_splt[1])
        except (IndexError, ValueError):
            # IndexError: no ",-" separator; ValueError: non-numeric offset.
            # Anything else (e.g. KeyboardInterrupt) must not be masked.
            raise ValueError("Malformed timezone string.")

        return self.load_name(offset)
|
||||
|
||||
|
||||
class tzwinbase(tzrangebase):
    """tzinfo class based on win32's timezones available in the registry."""
    def __init__(self):
        raise NotImplementedError('tzwinbase is an abstract base class')

    def __eq__(self, other):
        # Compare on all relevant dimensions, including name.
        # NOTE(review): the ``_stdmonth`` / ``_dstmonth`` fields are not part
        # of the comparison - confirm that this is intended.
        if not isinstance(other, tzwinbase):
            return NotImplemented

        compared = (
            '_std_offset', '_dst_offset',
            '_stddayofweek', '_dstdayofweek',
            '_stdweeknumber', '_dstweeknumber',
            '_stdhour', '_dsthour',
            '_stdminute', '_dstminute',
            '_std_abbr', '_dst_abbr',
        )
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in compared)

    @staticmethod
    def list():
        """Return a list of all time zones known to the system."""
        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
            with winreg.OpenKey(handle, TZKEYNAME) as tzkey:
                # First element of QueryInfoKey is the number of sub keys.
                subkey_count = winreg.QueryInfoKey(tzkey)[0]
                result = [winreg.EnumKey(tzkey, index)
                          for index in range(subkey_count)]
        return result

    def display(self):
        """
        Return the display name of the time zone.
        """
        return self._display

    def transitions(self, year):
        """
        For a given year, get the DST on and off transition times, expressed
        always on the standard time side. For zones with no transitions, this
        function returns ``None``.

        :param year:
            The year whose transitions you would like to query.

        :return:
            Returns a :class:`tuple` of :class:`datetime.datetime` objects,
            ``(dston, dstoff)`` for zones with an annual DST transition, or
            ``None`` for fixed offset zones.
        """
        if not self.hasdst:
            return None

        dst_start = picknthweekday(year, self._dstmonth, self._dstdayofweek,
                                   self._dsthour, self._dstminute,
                                   self._dstweeknumber)

        dst_end = picknthweekday(year, self._stdmonth, self._stddayofweek,
                                 self._stdhour, self._stdminute,
                                 self._stdweeknumber)

        # Ambiguous dates default to the STD side
        return dst_start, dst_end - self._dst_base_offset

    def _get_hasdst(self):
        """A DST start month of 0 marks a zone without DST."""
        return self._dstmonth != 0

    @property
    def _dst_base_offset(self):
        """The DST shift stored by the subclass constructor."""
        return self._dst_base_offset_
|
||||
|
||||
|
||||
class tzwin(tzwinbase):
    """
    Time zone object created from the zone info in the Windows registry

    These are similar to :py:class:`dateutil.tz.tzrange` objects in that
    the time zone data is provided in the format of a single offset rule
    for either 0 or 2 time zone transitions per year.

    :param name:
        The name of a Windows time zone key, e.g. "Eastern Standard Time".
        The full list of keys can be retrieved with :func:`tzwin.list`.
    """

    def __init__(self, name):
        self._name = name

        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
            tzkeyname = text_type("{kn}\\{name}").format(kn=TZKEYNAME, name=name)
            with winreg.OpenKey(handle, tzkeyname) as tzkey:
                keydict = valuestodict(tzkey)

        self._std_abbr = keydict["Std"]
        self._dst_abbr = keydict["Dlt"]

        self._display = keydict["Display"]

        # "TZI" is a packed binary record: three longs (the biases, in
        # minutes) followed by sixteen shorts (the two transition dates).
        tzi = struct.unpack("=3l16h", keydict["TZI"])
        bias, std_bias, dlt_bias = tzi[:3]

        std_minutes = -bias - std_bias        # Bias + StandardBias * -1
        dst_minutes = std_minutes - dlt_bias  # + DaylightBias * -1
        self._std_offset = datetime.timedelta(minutes=std_minutes)
        self._dst_offset = datetime.timedelta(minutes=dst_minutes)

        # for the meaning see the win32 TIME_ZONE_INFORMATION structure docs
        # http://msdn.microsoft.com/en-us/library/windows/desktop/ms725481(v=vs.85).aspx
        (self._stdmonth,
         self._stddayofweek,   # Sunday = 0
         self._stdweeknumber,  # Last = 5
         self._stdhour,
         self._stdminute) = tzi[4:9]

        (self._dstmonth,
         self._dstdayofweek,   # Sunday = 0
         self._dstweeknumber,  # Last = 5
         self._dsthour,
         self._dstminute) = tzi[12:17]

        self._dst_base_offset_ = self._dst_offset - self._std_offset
        self.hasdst = self._get_hasdst()

    def __repr__(self):
        return "tzwin(%s)" % repr(self._name)

    def __reduce__(self):
        # Pickle by key name; unpickling re-reads the registry.
        return (self.__class__, (self._name,))
|
||||
|
||||
|
||||
class tzwinlocal(tzwinbase):
    """
    Class representing the local time zone information in the Windows registry

    While :class:`dateutil.tz.tzlocal` makes system calls (via the :mod:`time`
    module) to retrieve time zone information, ``tzwinlocal`` retrieves the
    rules directly from the Windows registry and creates an object like
    :class:`dateutil.tz.tzwin`.

    Because Windows does not have an equivalent of :func:`time.tzset`, on
    Windows, :class:`dateutil.tz.tzlocal` instances will always reflect the
    time zone settings *at the time that the process was started*, meaning
    changes to the machine's time zone settings during the run of a program
    on Windows will **not** be reflected by :class:`dateutil.tz.tzlocal`.
    Because ``tzwinlocal`` reads the registry directly, it is unaffected by
    this issue.
    """
    def __init__(self):
        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
            with winreg.OpenKey(handle, TZLOCALKEYNAME) as tzlocalkey:
                keydict = valuestodict(tzlocalkey)

            self._std_abbr = keydict["StandardName"]
            self._dst_abbr = keydict["DaylightName"]

            # The display name lives under the zone's own key, looked up by
            # standard name; a failed lookup leaves it unset.
            try:
                tzkeyname = text_type('{kn}\\{sn}').format(kn=TZKEYNAME,
                                                          sn=self._std_abbr)
                with winreg.OpenKey(handle, tzkeyname) as tzkey:
                    _keydict = valuestodict(tzkey)
                    self._display = _keydict["Display"]
            except OSError:
                self._display = None

        std_minutes = -keydict["Bias"] - keydict["StandardBias"]
        dst_minutes = std_minutes - keydict["DaylightBias"]

        self._std_offset = datetime.timedelta(minutes=std_minutes)
        self._dst_offset = datetime.timedelta(minutes=dst_minutes)

        # For reasons unclear, in this particular key, the day of week has been
        # moved to the END of the SYSTEMTIME structure.
        std_start = struct.unpack("=8h", keydict["StandardStart"])

        (self._stdmonth,
         self._stdweeknumber,  # Last = 5
         self._stdhour,
         self._stdminute) = std_start[1:5]

        self._stddayofweek = std_start[7]

        dst_start = struct.unpack("=8h", keydict["DaylightStart"])

        (self._dstmonth,
         self._dstweeknumber,  # Last = 5
         self._dsthour,
         self._dstminute) = dst_start[1:5]

        self._dstdayofweek = dst_start[7]

        self._dst_base_offset_ = self._dst_offset - self._std_offset
        self.hasdst = self._get_hasdst()

    def __repr__(self):
        return "tzwinlocal()"

    def __str__(self):
        # str will return the standard name, not the daylight name.
        return "tzwinlocal(%s)" % repr(self._std_abbr)

    def __reduce__(self):
        # Takes no arguments; unpickling re-reads the registry.
        return (self.__class__, ())
|
||||
|
||||
|
||||
def picknthweekday(year, month, dayofweek, hour, minute, whichweek):
    """ dayofweek == 0 means Sunday, whichweek 5 means last instance """
    month_start = datetime.datetime(year, month, 1, hour, minute)

    # This will work if dayofweek is ISO weekday (1-7) or Microsoft-style (0-6),
    # Because 7 % 7 = 0
    days_ahead = (dayofweek - month_start.isoweekday()) % 7
    first_match = month_start.replace(day=days_ahead + 1)

    candidate = first_match + ((whichweek - 1) * ONEWEEK)
    if candidate.month == month:
        return candidate

    # Stepped past the end of the month: back up one week.
    return candidate - ONEWEEK
|
||||
|
||||
|
||||
def valuestodict(key):
    """Convert a registry key's values to a dictionary."""
    converted = {}
    # Second element of QueryInfoKey is the number of values on the key.
    value_count = winreg.QueryInfoKey(key)[1]
    tz_res = None  # created lazily, only if a tzres reference shows up

    for index in range(value_count):
        key_name, value, dtype = winreg.EnumValue(key, index)

        if dtype in (winreg.REG_DWORD, winreg.REG_DWORD_LITTLE_ENDIAN):
            # If it's a DWORD (32-bit integer), it's stored as unsigned - convert
            # that to a proper signed integer
            if value & (1 << 31):
                value -= 1 << 32
        elif dtype == winreg.REG_SZ:
            # If it's a reference to the tzres DLL, load the actual string
            if value.startswith('@tzres'):
                if not tz_res:
                    tz_res = tzres()
                value = tz_res.name_from_string(value)

            value = value.rstrip('\x00')    # Remove trailing nulls

        converted[key_name] = value

    return converted
|
||||
2
venv/lib/python3.11/site-packages/dateutil/tzwin.py
Normal file
2
venv/lib/python3.11/site-packages/dateutil/tzwin.py
Normal file
@ -0,0 +1,2 @@
|
||||
# tzwin has moved to dateutil.tz.win
|
||||
from .tz.win import *
|
||||
71
venv/lib/python3.11/site-packages/dateutil/utils.py
Normal file
71
venv/lib/python3.11/site-packages/dateutil/utils.py
Normal file
@ -0,0 +1,71 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This module offers general convenience and utility functions for dealing with
|
||||
datetimes.
|
||||
|
||||
.. versionadded:: 2.7.0
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from datetime import datetime, time
|
||||
|
||||
|
||||
def today(tzinfo=None):
    """
    Returns a :py:class:`datetime` representing the current day at midnight

    :param tzinfo:
        The time zone to attach (also used to determine the current day).

    :return:
        A :py:class:`datetime.datetime` object representing the current day
        at midnight.
    """
    current_date = datetime.now(tzinfo).date()
    midnight = time(0, tzinfo=tzinfo)
    return datetime.combine(current_date, midnight)
|
||||
|
||||
|
||||
def default_tzinfo(dt, tzinfo):
    """
    Sets the ``tzinfo`` parameter on naive datetimes only

    This is useful for example when you are provided a datetime that may have
    either an implicit or explicit time zone, such as when parsing a time zone
    string.

    .. doctest::

        >>> from dateutil.tz import tzoffset
        >>> from dateutil.parser import parse
        >>> from dateutil.utils import default_tzinfo
        >>> dflt_tz = tzoffset("EST", -18000)
        >>> print(default_tzinfo(parse('2014-01-01 12:30 UTC'), dflt_tz))
        2014-01-01 12:30:00+00:00
        >>> print(default_tzinfo(parse('2014-01-01 12:30'), dflt_tz))
        2014-01-01 12:30:00-05:00

    :param dt:
        The datetime on which to replace the time zone

    :param tzinfo:
        The :py:class:`datetime.tzinfo` subclass instance to assign to
        ``dt`` if (and only if) it is naive.

    :return:
        ``dt`` itself when it already carries a ``tzinfo``, otherwise a copy
        of ``dt`` with ``tzinfo`` attached.
    """
    if dt.tzinfo is None:
        return dt.replace(tzinfo=tzinfo)
    return dt
|
||||
|
||||
|
||||
def within_delta(dt1, dt2, delta):
    """
    Tell whether two datetimes lie within ``delta`` of each other, so that a
    negligible difference can be treated as equality.

    :param dt1:
        The first datetime.

    :param dt2:
        The second datetime.

    :param delta:
        The permitted difference; its absolute value is used.

    :return:
        ``True`` if ``dt1 - dt2`` lies in ``[-abs(delta), abs(delta)]``.
    """
    tolerance = abs(delta)
    gap = dt1 - dt2
    lower, upper = -tolerance, tolerance
    return lower <= gap <= upper
|
||||
167
venv/lib/python3.11/site-packages/dateutil/zoneinfo/__init__.py
Normal file
167
venv/lib/python3.11/site-packages/dateutil/zoneinfo/__init__.py
Normal file
@ -0,0 +1,167 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import warnings
|
||||
import json
|
||||
|
||||
from tarfile import TarFile
|
||||
from pkgutil import get_data
|
||||
from io import BytesIO
|
||||
|
||||
from dateutil.tz import tzfile as _tzfile
|
||||
|
||||
__all__ = ["get_zonefile_instance", "gettz", "gettz_db_metadata"]
|
||||
|
||||
ZONEFILENAME = "dateutil-zoneinfo.tar.gz"
|
||||
METADATA_FN = 'METADATA'
|
||||
|
||||
|
||||
class tzfile(_tzfile):
    """``tzfile`` variant that pickles by zone name via :func:`gettz`."""

    def __reduce__(self):
        # Rebuild through ``gettz`` using the stored file name.
        reconstruct_args = (self._filename,)
        return (gettz, reconstruct_args)
|
||||
|
||||
|
||||
def getzoneinfofile_stream():
    """
    Open the zoneinfo tarball shipped inside this package.

    :return:
        A :class:`io.BytesIO` wrapping the package data named by
        ``ZONEFILENAME``, or ``None`` (after emitting a warning) if reading
        it raises ``IOError``.
    """
    try:
        payload = get_data(__name__, ZONEFILENAME)
        stream = BytesIO(payload)
    except IOError as e:  # TODO switch to FileNotFoundError?
        warnings.warn("I/O error({0}): {1}".format(e.errno, e.strerror))
        stream = None
    return stream
|
||||
|
||||
|
||||
class ZoneInfoFile(object):
    """
    In-memory view of a zoneinfo tarball.

    ``zones`` maps member names to ``tzfile`` objects and ``metadata`` holds
    the decoded ``METADATA`` member, or ``None`` if there is none.
    """

    def __init__(self, zonefile_stream=None):
        """
        :param zonefile_stream:
            File object of the tarball to load; ``None`` produces an empty
            instance.
        """
        if zonefile_stream is None:
            self.zones = {}
            self.metadata = None
            return

        with TarFile.open(fileobj=zonefile_stream) as tf:
            # Regular files (other than the metadata member) are zones.
            zones = {}
            for member in tf.getmembers():
                if member.isfile() and member.name != METADATA_FN:
                    zones[member.name] = tzfile(tf.extractfile(member),
                                                filename=member.name)
            self.zones = zones

            # deal with links: They'll point to their parent object. Less
            # waste of memory
            links = {}
            for member in tf.getmembers():
                if member.islnk() or member.issym():
                    links[member.name] = self.zones[member.linkname]
            self.zones.update(links)

            try:
                metadata_json = tf.extractfile(tf.getmember(METADATA_FN))
                metadata_str = metadata_json.read().decode('UTF-8')
                self.metadata = json.loads(metadata_str)
            except KeyError:
                # no metadata in tar file
                self.metadata = None

    def get(self, name, default=None):
        """
        Wrapper for :func:`ZoneInfoFile.zones.get`. This is a convenience method
        for retrieving zones from the zone dictionary.

        :param name:
            The name of the zone to retrieve. (Generally IANA zone names)

        :param default:
            The value to return in the event of a missing key.

        .. versionadded:: 2.6.0

        """
        return self.zones.get(name, default)
|
||||
|
||||
|
||||
# The current API has gettz as a module function, although in fact it taps into
|
||||
# a stateful class. So as a workaround for now, without changing the API, we
|
||||
# will create a new "global" class instance the first time a user requests a
|
||||
# timezone. Ugly, but adheres to the api.
|
||||
#
|
||||
# TODO: Remove after deprecation period.
|
||||
_CLASS_ZONE_INSTANCE = []
|
||||
|
||||
|
||||
def get_zonefile_instance(new_instance=False):
    """
    Return a :class:`ZoneInfoFile` built from the zoneinfo data shipped with
    the ``dateutil`` package. A single instance is cached on this function
    and reused by default.

    :param new_instance:
        If ``True``, a new :class:`ZoneInfoFile` is instantiated and replaces
        the cached instance for subsequent calls. Otherwise a new instance is
        created only when none is cached yet.

    :return:
        Returns a :class:`ZoneInfoFile` object.

    .. versionadded:: 2.6
    """
    cached = None
    if not new_instance:
        cached = getattr(get_zonefile_instance, '_cached_instance', None)

    if cached is not None:
        return cached

    # Nothing usable is cached (or a refresh was requested): build and store.
    fresh = ZoneInfoFile(getzoneinfofile_stream())
    get_zonefile_instance._cached_instance = fresh
    return fresh
|
||||
|
||||
|
||||
def gettz(name):
    """
    Look up a time zone in the zoneinfo tarball that is packaged with
    dateutil.

    :param name:
        An IANA-style time zone name, as found in the zoneinfo file.

    :return:
        Returns a :class:`dateutil.tz.tzfile` time zone object.

    .. warning::
        It is generally inadvisable to use this function, and it is only
        provided for API compatibility with earlier versions. This is *not*
        equivalent to ``dateutil.tz.gettz()``, which selects an appropriate
        time zone based on the inputs, favoring system zoneinfo. This is ONLY
        for accessing the dateutil-specific zoneinfo (which may be out of
        date compared to the system zoneinfo).

    .. deprecated:: 2.6
        If you need to use a specific zoneinfofile over the system zoneinfo,
        instantiate a :class:`dateutil.zoneinfo.ZoneInfoFile` object and call
        :func:`dateutil.zoneinfo.ZoneInfoFile.get(name)` instead.

        Use :func:`get_zonefile_instance` to retrieve an instance of the
        dateutil-provided zoneinfo.
    """
    warnings.warn("zoneinfo.gettz() will be removed in future versions, "
                  "to use the dateutil-provided zoneinfo files, instantiate a "
                  "ZoneInfoFile object and use ZoneInfoFile.zones.get() "
                  "instead. See the documentation for details.",
                  DeprecationWarning)

    # The module-level list holds at most one lazily created instance.
    if not _CLASS_ZONE_INSTANCE:
        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
    shared = _CLASS_ZONE_INSTANCE[0]
    return shared.zones.get(name)
|
||||
|
||||
|
||||
def gettz_db_metadata():
    """ Get the zonefile metadata

    See `zonefile_metadata`_

    :returns:
        A dictionary with the database metadata

    .. deprecated:: 2.6
        See deprecation warning in :func:`zoneinfo.gettz`. To get metadata,
        query the attribute ``zoneinfo.ZoneInfoFile.metadata``.
    """
    # The message previously lacked its verb ("... zoneinfo files,
    # ZoneInfoFile object and query ..."); it now mirrors the wording used
    # by gettz().
    warnings.warn("zoneinfo.gettz_db_metadata() will be removed in future "
                  "versions, to use the dateutil-provided zoneinfo files, "
                  "instantiate a ZoneInfoFile object and query the "
                  "'metadata' attribute instead. See the documentation for "
                  "details.",
                  DeprecationWarning)

    # The module-level list holds at most one lazily created instance.
    if not _CLASS_ZONE_INSTANCE:
        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
    return _CLASS_ZONE_INSTANCE[0].metadata
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,75 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import json
|
||||
from subprocess import check_call, check_output
|
||||
from tarfile import TarFile
|
||||
|
||||
from dateutil.zoneinfo import METADATA_FN, ZONEFILENAME
|
||||
|
||||
|
||||
def rebuild(filename, tag=None, format="gz", zonegroups=(), metadata=None):
    """Rebuild the internal timezone info in dateutil/zoneinfo/zoneinfo*tar*

    filename is the timezone tarball from ``ftp.iana.org/tz``.

    :param filename:
        Path of the source tarball to read the zone groups from.

    :param tag:
        Accepted for backward compatibility; not used by this function.

    :param format:
        Compression suffix used in the tarfile write mode (``"w:<format>"``).

    :param zonegroups:
        Names of the tarball members to extract and compile with ``zic``.

    :param metadata:
        JSON-serialisable object written to the ``METADATA_FN`` entry of the
        rebuilt tarball.
    """
    # The names are needed twice (extraction and path building), so
    # materialise them once; this also makes one-shot iterables safe.
    zonegroups = list(zonegroups)

    tmpdir = tempfile.mkdtemp()
    zonedir = os.path.join(tmpdir, "zoneinfo")
    moduledir = os.path.dirname(__file__)
    try:
        with TarFile.open(filename) as tf:
            for name in zonegroups:
                tf.extract(name, tmpdir)
        filepaths = [os.path.join(tmpdir, n) for n in zonegroups]

        _run_zic(zonedir, filepaths)

        # write metadata file
        with open(os.path.join(zonedir, METADATA_FN), 'w') as f:
            json.dump(metadata, f, indent=4, sort_keys=True)
        target = os.path.join(moduledir, ZONEFILENAME)
        with TarFile.open(target, "w:%s" % format) as tf:
            for entry in os.listdir(zonedir):
                entrypath = os.path.join(zonedir, entry)
                tf.add(entrypath, entry)
    finally:
        # Always remove the scratch directory, even when zic or tar fails.
        shutil.rmtree(tmpdir)
|
||||
|
||||
|
||||
def _run_zic(zonedir, filepaths):
    """Invoke the ``zic`` compiler so that it always emits a "fat" binary.

    Recent versions of ``zic`` default to ``-b slim``, while older versions
    don't even have the ``-b`` option (but default to "fat" binaries). The
    current version of dateutil does not support Version 2+ TZif files, which
    causes problems when used in conjunction with "slim" binaries, so the
    ``-b fat`` flag is passed whenever the installed ``zic`` advertises it.
    """
    try:
        usage = check_output(["zic", "--help"])
    except OSError as err:
        _print_on_nosuchfile(err)
        raise

    # Only pass "-b fat" when the help text shows the option exists.
    fat_flag = ["-b", "fat"] if b"-b " in usage else []

    command = ["zic"] + fat_flag + ["-d", zonedir] + filepaths
    check_call(command)
|
||||
|
||||
|
||||
def _print_on_nosuchfile(e):
    """Print helpful troubleshooting message

    e is an exception raised by subprocess.check_call()

    An error is logged only when ``e.errno`` is ``errno.ENOENT`` (no such
    file or directory); any other error number is ignored here.
    """
    import errno

    # Use the symbolic constant rather than the bare number 2.
    if e.errno == errno.ENOENT:
        logging.error(
            "Could not find zic. Perhaps you need to install "
            "libc-bin or some other package that provides it, "
            "or it's not in your PATH?")
|
||||
@ -0,0 +1 @@
|
||||
pip
|
||||
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [2020] [Paul Davis <paul.joseph.davis@gmail.com>]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@ -0,0 +1,154 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: ghp-import
|
||||
Version: 2.1.0
|
||||
Summary: Copy your docs directly to the gh-pages branch.
|
||||
Home-page: https://github.com/c-w/ghp-import
|
||||
Author: Paul Joseph Davis
|
||||
Author-email: paul.joseph.davis@gmail.com
|
||||
License: Apache Software License
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: python-dateutil (>=2.8.1)
|
||||
Provides-Extra: dev
|
||||
Requires-Dist: twine ; extra == 'dev'
|
||||
Requires-Dist: markdown ; extra == 'dev'
|
||||
Requires-Dist: flake8 ; extra == 'dev'
|
||||
Requires-Dist: wheel ; extra == 'dev'
|
||||
|
||||
GitHub Pages Import
|
||||
===================
|
||||
|
||||
[](https://github.com/davisp/ghp-import/actions?query=workflow%3Aci)
|
||||
[](https://circleci.com/gh/c-w/ghp-import/tree/master)
|
||||
[](https://travis-ci.org/c-w/ghp-import)
|
||||
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
[](https://pypi.org/project/ghp-import/)
|
||||
|
||||
As part of [gunicorn][gunicorn], [Benoit Chesneau][benoit] and [Paul Davis][davisp]
|
||||
were looking at how to host documentation. There's the obvious method of
|
||||
using [GitHub's post-receive hook][github-post] to trigger doc builds and rsync
|
||||
to a webserver, but we ended up wanting to try out github's hosting to make the
|
||||
whole interface a bit more robust.
|
||||
|
||||
[GitHub Pages][gh-pages] is a pretty awesome service that GitHub provides for
|
||||
hosting project documentation. The only thing is that it requires a
|
||||
`gh-pages` branch that is the site's document root. This means that keeping
|
||||
documentation sources in the branch with code is a bit difficult. And it really
|
||||
turns into a head scratcher for things like [Sphinx][sphinx] that want to
|
||||
access documentation sources and code sources at the same time.
|
||||
|
||||
Then we stumbled across an interesting looking package called
|
||||
[github-tools][github-tools] that looked almost like what we wanted. It was a tad
|
||||
complicated and more involved than we wanted but it gave us an idea. Why not
|
||||
just write a script that can copy a directory to the `gh-pages` branch of the
|
||||
repository. This saves us from even having to think about the branch and
|
||||
everything becomes magical.
|
||||
|
||||
This is what `ghp-import` was written for.
|
||||
|
||||
[gunicorn]: http://www.gunicorn.com/ "Gunicorn"
|
||||
[benoit]: http://github.com/benoitc "Benoît Chesneau"
|
||||
[davisp]: http://github.com/davisp "Paul J. Davis"
|
||||
[github-post]: https://help.github.com/articles/post-receive-hooks "GitHub Post-Receive Hook"
|
||||
[gh-pages]: http://pages.github.com/ "GitHub Pages"
|
||||
[sphinx]: http://sphinx.pocoo.org/ "Sphinx Documentation"
|
||||
[github-tools]: http://dinoboff.github.com/github-tools/ "github-tools"
|
||||
|
||||
|
||||
Big Fat Warning
|
||||
---------------
|
||||
|
||||
This will **DESTROY** your `gh-pages` branch. If you love it, you'll want to
|
||||
take backups before playing with this. This script assumes that `gh-pages` is
|
||||
100% derivative. You should never edit files in your `gh-pages` branch by hand
|
||||
if you're using this script because you will lose your work.
|
||||
|
||||
When used with a prefix, only files below the set prefix will be destroyed, limiting the
|
||||
above warning to just that directory and everything below it.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
```
|
||||
Usage: ghp-import [OPTIONS] DIRECTORY
|
||||
|
||||
Options:
|
||||
-n, --no-jekyll Include a .nojekyll file in the branch.
|
||||
-c CNAME, --cname=CNAME
|
||||
Write a CNAME file with the given CNAME.
|
||||
-m MESG, --message=MESG
|
||||
The commit message to use on the target branch.
|
||||
-p, --push Push the branch to origin/{branch} after committing.
|
||||
-x PREFIX, --prefix=PREFIX
|
||||
The prefix to add to each file that gets pushed to the
|
||||
remote. Only files below this prefix will be cleared
|
||||
out. [none]
|
||||
-f, --force Force the push to the repository.
|
||||
-o, --no-history Force new commit without parent history.
|
||||
-r REMOTE, --remote=REMOTE
|
||||
The name of the remote to push to. [origin]
|
||||
-b BRANCH, --branch=BRANCH
|
||||
Name of the branch to write to. [gh-pages]
|
||||
-s, --shell Use the shell when invoking Git. [False]
|
||||
-l, --follow-links Follow symlinks when adding files. [False]
|
||||
-h, --help show this help message and exit
|
||||
```
|
||||
|
||||
It's pretty simple. Inside your repository just run `ghp-import $DOCS_DIR`
|
||||
where `$DOCS_DIR` is the path to the **built** documentation. This will write a
|
||||
commit to your `gh-pages` branch with the current documents in it.
|
||||
|
||||
If you specify `-p` it will also attempt to push the `gh-pages` branch to
|
||||
GitHub. By default it'll just run `git push origin gh-pages`. You can specify
|
||||
a different remote using the `-r` flag.
|
||||
|
||||
The `-o` option will discard any previous history and ensure that only a
|
||||
single commit is always pushed to the `gh-pages` branch. This is useful to
|
||||
avoid bloating the repository size and is **highly recommended**.
|
||||
|
||||
You can specify a different branch with `-b`. This is useful for user and
|
||||
organization pages, which are served from the `master` branch.
|
||||
|
||||
Some Windows users report needing to pass Git commands through the shell which can be accomplished by passing `-s`.
|
||||
|
||||
The `-l` option will cause the import to follow symlinks for users that have odd configurations that include symlinking outside of their documentation directory.
|
||||
|
||||
Python Usage
|
||||
------------
|
||||
|
||||
You can also call ghp_import directly from your Python code as a library. The
|
||||
library has one public function `ghp_import.ghp_import`, which accepts the
|
||||
following arguments:
|
||||
|
||||
* `srcdir`: The path to the **built** documentation (required).
|
||||
* `remote`: The name of the remote to push to. Default: `origin`.
|
||||
* `branch`: Name of the branch to write to. Default: `gh-pages`.
|
||||
* `mesg`: The commit message to use on the target branch. Default: `Update documentation`.
|
||||
* `push`: Push the branch to {remote}/{branch} after committing. Default: `False`.
|
||||
* `prefix`: The prefix to add to each file that gets pushed to the remote. Default: `None`.
|
||||
* `force`: Force the push to the repository. Default: `False`.
|
||||
* `no_history`: Force new commit without parent history. Default: `False`.
|
||||
* `use_shell`: Use the shell when invoking Git. Default: `False`.
|
||||
* `followlinks`: Follow symlinks when adding files. Default: `False`.
|
||||
* `cname`: Write a CNAME file with the given CNAME. Default: `None`.
|
||||
* `nojekyll`: Include a .nojekyll file in the branch. Default: `False`.
|
||||
|
||||
With Python's current working directory (cwd) inside your repository, do the
|
||||
following:
|
||||
|
||||
```python
|
||||
from ghp_import import ghp_import
|
||||
ghp_import('docs', push=True, cname='example.com')
|
||||
```
|
||||
|
||||
|
||||
@ -0,0 +1,10 @@
|
||||
../../../bin/ghp-import,sha256=EkLD9La0ChhbncgMfGa6pTztBlH6VnVGXt7GS1dRghA,255
|
||||
__pycache__/ghp_import.cpython-311.pyc,,
|
||||
ghp_import-2.1.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
ghp_import-2.1.0.dist-info/LICENSE,sha256=C8j_tF8m7dHNDeT1BCWuLLRsWMbYrBE5hNQSC-NVr6k,11374
|
||||
ghp_import-2.1.0.dist-info/METADATA,sha256=PCrYmDTJ2XjIuUkYM33d1t8Fva95SG2UphvN-t9b6y8,7177
|
||||
ghp_import-2.1.0.dist-info/RECORD,,
|
||||
ghp_import-2.1.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
ghp_import-2.1.0.dist-info/entry_points.txt,sha256=mk55YA2cS0KmQK9APJFk_1ny9zFWFOZDtOBZMO0cu1Y,48
|
||||
ghp_import-2.1.0.dist-info/top_level.txt,sha256=QGVcxjaCFAMEV3ZX7ADAlIMIlsiyfplHNQi-JwrTgow,11
|
||||
ghp_import.py,sha256=zvDcFrdka_GzgEkD1BjJrBfDHD3sCExSbnJHmBE1igU,9234
|
||||
@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@ -0,0 +1,3 @@
|
||||
[console_scripts]
|
||||
ghp-import = ghp_import:main
|
||||
|
||||
@ -0,0 +1 @@
|
||||
ghp_import
|
||||
306
venv/lib/python3.11/site-packages/ghp_import.py
Normal file
306
venv/lib/python3.11/site-packages/ghp_import.py
Normal file
@ -0,0 +1,306 @@
|
||||
#! /usr/bin/env python
|
||||
|
||||
import errno
|
||||
import os
|
||||
import subprocess as sp
|
||||
import sys
|
||||
import time
|
||||
from dateutil import tz
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
from shlex import quote
|
||||
except ImportError:
|
||||
from pipes import quote
|
||||
|
||||
__all__ = ['ghp_import']
|
||||
__version__ = "2.1.0"
|
||||
|
||||
|
||||
class GhpError(Exception):
    """Error raised by ghp-import; the text is kept in ``message``."""

    def __init__(self, message):
        Exception.__init__(self, message)
        self.message = message
|
||||
|
||||
|
||||
# Text/bytes helpers, defined separately for Python 3 and Python 2.
if sys.version_info[0] == 3:
    def enc(text):
        """Return *text* as bytes, encoding it when it is not bytes yet."""
        return text if isinstance(text, bytes) else text.encode()

    def dec(text):
        """Return *text* as str, decoding bytes as UTF-8."""
        return text.decode('utf-8') if isinstance(text, bytes) else text

    def write(pipe, data):
        """Write *data* to ``pipe.stdin``, ignoring a broken pipe (EPIPE)."""
        try:
            pipe.stdin.write(data)
        except IOError as err:
            if err.errno == errno.EPIPE:
                return
            raise
else:
    def enc(text):
        """Return *text* as a byte string, encoding unicode as UTF-8."""
        return text.encode('utf-8') if isinstance(text, unicode) else text  # noqa F821

    def dec(text):
        """Return *text* as unicode, decoding byte strings as UTF-8."""
        return text if isinstance(text, unicode) else text.decode('utf-8')  # noqa F821

    def write(pipe, data):
        """Write *data* to ``pipe.stdin``."""
        pipe.stdin.write(data)
|
||||
|
||||
|
||||
class Git(object):
    """Small wrapper for running ``git`` subcommands as child processes.

    ``cmd`` holds the most recent command line, ``pipe`` the most recent
    ``Popen`` object created by :meth:`open`, and ``stdout``/``stderr`` the
    output captured by the most recent :meth:`call`.
    """

    def __init__(self, use_shell=False):
        """Store whether commands are to be run through the shell."""
        self.use_shell = use_shell

        self.cmd = None
        self.pipe = None
        self.stderr = None
        self.stdout = None

    def _build_cmd(self, args):
        """Return the git command in the form matching ``use_shell``."""
        if self.use_shell:
            # With shell=True a list would hand only its first item to the
            # shell on POSIX, so build one quoted command string instead.
            return 'git ' + ' '.join(map(quote, args))
        return ['git'] + list(args)

    def check_repo(self):
        """Raise :class:`GhpError` when ``git rev-parse`` exits non-zero.

        The error text is git's stderr with a leading ``"fatal: "`` removed,
        or ``"Unknown Git error"`` when stderr is empty.
        """
        if self.call('rev-parse') != 0:
            error = self.stderr
            if not error:
                error = "Unknown Git error"
            error = dec(error)
            if error.startswith("fatal: "):
                error = error[len("fatal: "):]
            raise GhpError(error)

    def try_rebase(self, remote, branch, no_history=False):
        """Align the local *branch* ref with ``<remote>/<branch>``.

        Returns ``True`` when the remote branch cannot be listed (nothing to
        do). Otherwise the local ref is deleted when *no_history* is true, or
        pointed at the remote revision; the result tells whether that
        ``update-ref`` call succeeded.
        """
        rc = self.call('rev-list', '--max-count=1', '%s/%s' % (remote, branch))
        if rc != 0:
            return True
        rev = dec(self.stdout.strip())
        if no_history:
            rc = self.call('update-ref', '-d', 'refs/heads/%s' % branch)
        else:
            rc = self.call('update-ref', 'refs/heads/%s' % branch, rev)
        if rc != 0:
            return False
        return True

    def get_config(self, key):
        """Return the stripped stdout of ``git config <key>``.

        The exit status is not checked and the value is not decoded.
        """
        self.call('config', key)
        return self.stdout.strip()

    def get_prev_commit(self, branch):
        """Return the newest revision of *branch*, or ``None`` on failure."""
        rc = self.call('rev-list', '--max-count=1', branch, '--')
        if rc != 0:
            return None
        return dec(self.stdout).strip()

    def open(self, *args, **kwargs):
        """Start ``git <args>`` with ``Popen`` and return the process.

        stdin, stdout and stderr default to pipes unless given in *kwargs*;
        ``shell`` is always set from ``use_shell``.
        """
        self.cmd = self._build_cmd(args)
        if sys.version_info >= (3, 2, 0):
            kwargs['universal_newlines'] = False
        for k in 'stdin stdout stderr'.split():
            kwargs.setdefault(k, sp.PIPE)
        kwargs['shell'] = self.use_shell
        self.pipe = sp.Popen(self.cmd, **kwargs)
        return self.pipe

    def call(self, *args, **kwargs):
        """Run ``git <args>``, capture its output and return the exit code."""
        self.open(*args, **kwargs)
        (self.stdout, self.stderr) = self.pipe.communicate()
        return self.pipe.wait()

    def check_call(self, *args, **kwargs):
        """Run ``git <args>`` via ``subprocess.check_call``.

        The command is built the same way as in :meth:`open`, so the
        arguments are kept when ``use_shell`` is true, and it is recorded in
        ``cmd``. A non-zero exit raises ``subprocess.CalledProcessError``.
        """
        kwargs["shell"] = self.use_shell
        self.cmd = self._build_cmd(args)
        sp.check_call(self.cmd, **kwargs)
|
||||
|
||||
|
||||
def mk_when(timestamp=None):
    """Return ``"<timestamp> <utc offset>"`` for a commit header.

    *timestamp* defaults to the current time in whole seconds; the offset is
    the local zone's ``%z`` value at the moment of the call.
    """
    stamp = int(time.time()) if timestamp is None else timestamp
    local_offset = datetime.now(tz.tzlocal()).strftime('%z')
    return "%s %s" % (stamp, local_offset)
|
||||
|
||||
|
||||
def start_commit(pipe, git, branch, message, prefix=None):
    """Write the opening commands of a fast-import commit to *pipe*.

    Committer name, e-mail and date come from the ``GIT_COMMITTER_*``
    environment variables, falling back to the git config values and
    :func:`mk_when`. The commit continues from the previous commit of
    *branch* when there is one, and then clears either *prefix* or the whole
    tree.
    """
    def emit(text):
        write(pipe, enc(text))

    committer = os.getenv('GIT_COMMITTER_NAME',
                          dec(git.get_config('user.name')))
    address = os.getenv('GIT_COMMITTER_EMAIL',
                        dec(git.get_config('user.email')))
    stamp = os.getenv('GIT_COMMITTER_DATE', mk_when())

    emit('commit refs/heads/%s\n' % branch)
    emit('committer %s <%s> %s\n' % (committer, address, stamp))
    emit('data %d\n%s\n' % (len(enc(message)), message))

    parent = git.get_prev_commit(branch)
    if parent:
        emit('from %s\n' % parent)

    if not prefix:
        emit('deleteall\n')
    else:
        emit('D %s\n' % prefix)
|
||||
|
||||
|
||||
def add_file(pipe, srcpath, tgtpath):
    """Write the file at ``srcpath`` to ``pipe`` as an inline ``M`` entry.

    The entry is stored under ``tgtpath`` with mode 100755 when ``srcpath``
    is executable for the current process and 100644 otherwise, followed by
    a ``data`` header carrying the byte count, the file content and a
    newline.
    """
    with open(srcpath, "rb") as handle:
        executable = os.access(srcpath, os.X_OK)
        template = 'M 100755 inline %s\n' if executable else 'M 100644 inline %s\n'
        write(pipe, enc(template % tgtpath))

        payload = handle.read()
        write(pipe, enc('data %d\n' % len(payload)))
        write(pipe, enc(payload))
        write(pipe, enc('\n'))
|
||||
|
||||
|
||||
def add_nojekyll(pipe, prefix=None):
    """Write an empty ``.nojekyll`` file entry to ``pipe``.

    The file is placed under ``prefix`` when one is given, otherwise at the
    top level.
    """
    fpath = os.path.join(prefix, '.nojekyll') if prefix else '.nojekyll'
    for chunk in ('M 100644 inline %s\n' % fpath, 'data 0\n', '\n'):
        write(pipe, enc(chunk))
|
||||
|
||||
|
||||
def add_cname(pipe, cname):
    """Write a ``CNAME`` file entry whose content is ``cname`` to ``pipe``."""
    write(pipe, enc('M 100644 inline CNAME\n'))
    # The announced length is that of the encoded value.
    size = len(enc(cname))
    write(pipe, enc('data %d\n%s\n' % (size, cname)))
|
||||
|
||||
|
||||
def gitpath(fname):
    """Return ``fname`` normalised and written with ``/`` separators."""
    return os.path.normpath(fname).replace(os.path.sep, "/")
|
||||
|
||||
|
||||
def run_import(git, srcdir, **opts):
    """Stream every file under ``srcdir`` into ``git fast-import`` as one commit.

    ``opts`` must provide ``branch``, ``mesg``, ``prefix``, ``followlinks``,
    ``nojekyll`` and ``cname``. Files are stored under their path relative
    to ``srcdir``, below ``prefix`` when one is set. A ``.nojekyll`` and/or
    ``CNAME`` entry is appended when requested.

    A failing fast-import is reported on standard output; no exception is
    raised for it.
    """
    srcdir = dec(srcdir)
    # Only stdin is piped; fast-import's own output goes to the caller's
    # stdout and stderr.
    pipe = git.open('fast-import', '--date-format=rfc2822', '--quiet',
                    stdin=sp.PIPE, stdout=None, stderr=None)
    start_commit(pipe, git, opts['branch'], opts['mesg'], opts['prefix'])
    for path, _, fnames in os.walk(srcdir, followlinks=opts['followlinks']):
        for fn in fnames:
            fpath = os.path.join(path, fn)
            gpath = gitpath(os.path.relpath(fpath, start=srcdir))
            if opts['prefix']:
                gpath = os.path.join(opts['prefix'], gpath)
            add_file(pipe, fpath, gpath)
    if opts['nojekyll']:
        add_nojekyll(pipe, opts['prefix'])
    if opts['cname'] is not None:
        add_cname(pipe, opts['cname'])
    write(pipe, enc('\n'))
    pipe.stdin.close()
    if pipe.wait() != 0:
        # sys.stdout is a text stream: writing the encoded (bytes) message
        # raises TypeError on Python 3, hiding the real failure.
        sys.stdout.write("Failed to process commit.\n")
|
||||
|
||||
|
||||
def options():
    """Describe every command-line option as ``(flags, keywords)`` pairs.

    ``flags`` is the tuple of option strings and ``keywords`` the dict of
    keyword arguments for ``add_argument``. Each dict carries at least
    ``dest`` and ``default``, which ``ghp_import`` uses to build its default
    settings.
    """
    def switch(dest, text):
        # Boolean flag: off unless given on the command line.
        return dict(dest=dest, default=False, action='store_true', help=text)

    def setting(dest, default, text):
        # Option that takes a value.
        return dict(dest=dest, default=default, help=text)

    return [
        (('-n', '--no-jekyll'),
         switch('nojekyll', 'Include a .nojekyll file in the branch.')),
        (('-c', '--cname'),
         setting('cname', None, 'Write a CNAME file with the given CNAME.')),
        (('-m', '--message'),
         setting('mesg', 'Update documentation',
                 'The commit message to use on the target branch.')),
        (('-p', '--push'),
         switch('push', 'Push the branch to origin/{branch} after committing.')),
        (('-x', '--prefix'),
         setting('prefix', None,
                 'The prefix to add to each file that gets pushed to the '
                 'remote. Only files below this prefix will be cleared '
                 'out. [%(default)s]')),
        (('-f', '--force'),
         switch('force', 'Force the push to the repository.')),
        (('-o', '--no-history'),
         switch('no_history', 'Force new commit without parent history.')),
        (('-r', '--remote'),
         setting('remote', 'origin',
                 'The name of the remote to push to. [%(default)s]')),
        (('-b', '--branch'),
         setting('branch', 'gh-pages',
                 'Name of the branch to write to. [%(default)s]')),
        (('-s', '--shell'),
         switch('use_shell', 'Use the shell when invoking Git. [%(default)s]')),
        (('-l', '--follow-links'),
         switch('followlinks',
                'Follow symlinks when adding files. [%(default)s]')),
    ]
|
||||
|
||||
|
||||
def ghp_import(srcdir, **kwargs):
    """Commit the contents of ``srcdir`` to a branch and optionally push it.

    Keyword arguments override the defaults declared by ``options()`` and
    are keyed by each option's ``dest``.

    Raises:
        GhpError: if ``srcdir`` is not a directory or the branch could not
            be rebased.
    """
    if not os.path.isdir(srcdir):
        raise GhpError("Not a directory: %s" % srcdir)

    # Start from the declared defaults, then apply the caller's overrides.
    opts = {}
    for _, spec in options():
        opts[spec["dest"]] = spec["default"]
    opts.update(kwargs)

    git = Git(use_shell=opts['use_shell'])
    git.check_repo()

    rebased = git.try_rebase(opts['remote'], opts['branch'], opts['no_history'])
    if not rebased:
        raise GhpError("Failed to rebase %s branch." % opts['branch'])

    run_import(git, srcdir, **opts)

    if not opts['push']:
        return

    push_args = ['push', opts['remote'], opts['branch']]
    if opts['force'] or opts['no_history']:
        push_args.append('--force')
    git.check_call(*push_args)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the arguments and run ``ghp_import``.

    A ``GhpError`` is reported through the argument parser's ``error``
    method.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("directory")
    for flags, spec in options():
        parser.add_argument(*flags, **spec)

    # Work on the namespace's own dict so that the remaining entries can be
    # forwarded as keyword arguments.
    parsed = vars(parser.parse_args())
    srcdir = parsed.pop("directory")

    try:
        ghp_import(srcdir, **parsed)
    except GhpError as e:
        parser.error(e.message)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
61
venv/lib/python3.11/site-packages/markdown/__init__.py
Normal file
61
venv/lib/python3.11/site-packages/markdown/__init__.py
Normal file
@ -0,0 +1,61 @@
|
||||
"""
|
||||
Python Markdown
|
||||
|
||||
A Python implementation of John Gruber's Markdown.
|
||||
|
||||
Documentation: https://python-markdown.github.io/
|
||||
GitHub: https://github.com/Python-Markdown/markdown/
|
||||
PyPI: https://pypi.org/project/Markdown/
|
||||
|
||||
Started by Manfred Stienstra (http://www.dwerg.net/).
|
||||
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
|
||||
Currently maintained by Waylan Limberg (https://github.com/waylan),
|
||||
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
|
||||
|
||||
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
|
||||
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
|
||||
Copyright 2004 Manfred Stienstra (the original version)
|
||||
|
||||
License: BSD (see LICENSE.md for details).
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
# TODO: Remove this check at some point in the future.
|
||||
# (also remove flake8's 'ignore E402' comments below)
|
||||
if sys.version_info[0] < 3: # pragma: no cover
|
||||
raise ImportError('A recent version of Python 3 is required.')
|
||||
|
||||
from .core import Markdown, markdown, markdownFromFile # noqa: E402
|
||||
from .util import PY37 # noqa: E402
|
||||
from .pep562 import Pep562 # noqa: E402
|
||||
from .__meta__ import __version__, __version_info__ # noqa: E402
|
||||
import warnings # noqa: E402
|
||||
|
||||
# For backward compatibility as some extensions expect it...
|
||||
from .extensions import Extension # noqa
|
||||
|
||||
__all__ = ['Markdown', 'markdown', 'markdownFromFile']
|
||||
|
||||
__deprecated__ = {
|
||||
"version": ("__version__", __version__),
|
||||
"version_info": ("__version_info__", __version_info__)
|
||||
}
|
||||
|
||||
|
||||
def __getattr__(name):
    """Resolve deprecated module attributes, warning on each access.

    Names listed in ``__deprecated__`` return their replacement's value
    after a ``DeprecationWarning`` is issued; any other name raises
    ``AttributeError``.
    """
    entry = __deprecated__.get(name)
    if not entry:
        raise AttributeError("module '{}' has no attribute '{}'".format(__name__, name))

    # entry is (replacement name, value).
    text = "'{}' is deprecated. Use '{}' instead.".format(name, entry[0])
    depth = 3 if PY37 else 4
    warnings.warn(text, category=DeprecationWarning, stacklevel=depth)
    return entry[1]
|
||||
|
||||
|
||||
if not PY37:
|
||||
Pep562(__name__)
|
||||
151
venv/lib/python3.11/site-packages/markdown/__main__.py
Normal file
151
venv/lib/python3.11/site-packages/markdown/__main__.py
Normal file
@ -0,0 +1,151 @@
|
||||
"""
|
||||
Python Markdown
|
||||
|
||||
A Python implementation of John Gruber's Markdown.
|
||||
|
||||
Documentation: https://python-markdown.github.io/
|
||||
GitHub: https://github.com/Python-Markdown/markdown/
|
||||
PyPI: https://pypi.org/project/Markdown/
|
||||
|
||||
Started by Manfred Stienstra (http://www.dwerg.net/).
|
||||
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
|
||||
Currently maintained by Waylan Limberg (https://github.com/waylan),
|
||||
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
|
||||
|
||||
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
|
||||
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
|
||||
Copyright 2004 Manfred Stienstra (the original version)
|
||||
|
||||
License: BSD (see LICENSE.md for details).
|
||||
"""
|
||||
|
||||
import sys
|
||||
import optparse
|
||||
import codecs
|
||||
import warnings
|
||||
import markdown
|
||||
try:
|
||||
# We use `unsafe_load` because users may need to pass in actual Python
|
||||
# objects. As this is only available from the CLI, the user has much
|
||||
# worse problems if an attacker can use this as an attack vector.
|
||||
from yaml import unsafe_load as yaml_load
|
||||
except ImportError: # pragma: no cover
|
||||
try:
|
||||
# Fall back to PyYAML <5.1
|
||||
from yaml import load as yaml_load
|
||||
except ImportError:
|
||||
# Fall back to JSON
|
||||
from json import load as yaml_load
|
||||
|
||||
import logging
|
||||
from logging import DEBUG, WARNING, CRITICAL
|
||||
|
||||
logger = logging.getLogger('MARKDOWN')
|
||||
|
||||
|
||||
def parse_options(args=None, values=None):
    """
    Define and parse `optparse` options for command-line usage.

    Returns a tuple ``(opts, verbosity)``: ``opts`` is the dict of keyword
    arguments that ``run`` passes to ``markdown.markdownFromFile`` and
    ``verbosity`` is the logging level selected by ``-q``/``-v``/``--noisy``.
    An exception raised while parsing the extension config file is re-raised
    with its first argument replaced by a message naming the file.
    """
    # NOTE(review): the continuation line of this literal is reproduced as
    # it appears here; any leading whitespace it had may have been lost --
    # confirm against upstream.
    usage = """%prog [options] [INPUTFILE]
(STDIN is assumed if no INPUTFILE is given)"""
    desc = ("A Python implementation of John Gruber's Markdown. "
            "https://Python-Markdown.github.io/")
    ver = "%%prog %s" % markdown.__version__

    # Declared in the order they should be registered.
    option_table = [
        (("-f", "--file"), dict(
            dest="filename", default=None,
            help="Write output to OUTPUT_FILE. Defaults to STDOUT.",
            metavar="OUTPUT_FILE")),
        (("-e", "--encoding"), dict(
            dest="encoding",
            help="Encoding for input and output files.")),
        (("-o", "--output_format"), dict(
            dest="output_format",
            default='xhtml', metavar="OUTPUT_FORMAT",
            help="Use output format 'xhtml' (default) or 'html'.")),
        (("-n", "--no_lazy_ol"), dict(
            dest="lazy_ol",
            action='store_false', default=True,
            help="Observe number of first item of ordered lists.")),
        (("-x", "--extension"), dict(
            action="append", dest="extensions",
            help="Load extension EXTENSION.", metavar="EXTENSION")),
        (("-c", "--extension_configs"), dict(
            dest="configfile", default=None,
            help="Read extension configurations from CONFIG_FILE. "
                 "CONFIG_FILE must be of JSON or YAML format. YAML "
                 "format requires that a python YAML library be "
                 "installed. The parsed JSON or YAML must result in a "
                 "python dictionary which would be accepted by the "
                 "'extension_configs' keyword on the markdown.Markdown "
                 "class. The extensions must also be loaded with the "
                 "`--extension` option.",
            metavar="CONFIG_FILE")),
        (("-q", "--quiet"), dict(
            default=CRITICAL,
            action="store_const", const=CRITICAL+10, dest="verbose",
            help="Suppress all warnings.")),
        (("-v", "--verbose"), dict(
            action="store_const", const=WARNING, dest="verbose",
            help="Print all warnings.")),
        (("--noisy",), dict(
            action="store_const", const=DEBUG, dest="verbose",
            help="Print debug messages.")),
    ]

    parser = optparse.OptionParser(usage=usage, description=desc, version=ver)
    for flags, spec in option_table:
        parser.add_option(*flags, **spec)

    (options, args) = parser.parse_args(args, values)

    # The first positional argument, if any, is the input file.
    input_file = args[0] if args else None

    options.extensions = options.extensions or []

    extension_configs = {}
    if options.configfile:
        with codecs.open(
            options.configfile, mode="r", encoding=options.encoding
        ) as config_stream:
            try:
                extension_configs = yaml_load(config_stream)
            except Exception as err:
                message = "Failed parsing extension config file: %s" % \
                          options.configfile
                # Swap in the message but keep any further arguments.
                err.args = (message,) + err.args[1:]
                raise

    opts = dict(
        input=input_file,
        output=options.filename,
        extensions=options.extensions,
        extension_configs=extension_configs,
        encoding=options.encoding,
        output_format=options.output_format,
        lazy_ol=options.lazy_ol,
    )

    return opts, options.verbose
|
||||
|
||||
|
||||
def run():  # pragma: no cover
    """Run Markdown from the command line."""

    # Parse the command line; the second value is the logging level.
    settings, level = parse_options()
    if not settings:
        sys.exit(2)

    logger.setLevel(level)
    handler = logging.StreamHandler()
    logger.addHandler(handler)

    if level <= WARNING:
        # Ensure deprecation warnings get displayed, routed through logging
        # to the same handler.
        warnings.filterwarnings('default')
        logging.captureWarnings(True)
        logging.getLogger('py.warnings').addHandler(handler)

    # Run the conversion.
    markdown.markdownFromFile(**settings)
|
||||
|
||||
|
||||
if __name__ == '__main__': # pragma: no cover
|
||||
# Support running module as a commandline command.
|
||||
# `python -m markdown [options] [args]`.
|
||||
run()
|
||||
49
venv/lib/python3.11/site-packages/markdown/__meta__.py
Normal file
49
venv/lib/python3.11/site-packages/markdown/__meta__.py
Normal file
@ -0,0 +1,49 @@
|
||||
"""
|
||||
Python Markdown
|
||||
|
||||
A Python implementation of John Gruber's Markdown.
|
||||
|
||||
Documentation: https://python-markdown.github.io/
|
||||
GitHub: https://github.com/Python-Markdown/markdown/
|
||||
PyPI: https://pypi.org/project/Markdown/
|
||||
|
||||
Started by Manfred Stienstra (http://www.dwerg.net/).
|
||||
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
|
||||
Currently maintained by Waylan Limberg (https://github.com/waylan),
|
||||
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
|
||||
|
||||
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
|
||||
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
|
||||
Copyright 2004 Manfred Stienstra (the original version)
|
||||
|
||||
License: BSD (see LICENSE.md for details).
|
||||
"""
|
||||
|
||||
# __version_info__ format:
|
||||
# (major, minor, patch, dev/alpha/beta/rc/final, #)
|
||||
# (1, 1, 2, 'dev', 0) => "1.1.2.dev0"
|
||||
# (1, 1, 2, 'alpha', 1) => "1.1.2a1"
|
||||
# (1, 2, 0, 'beta', 2) => "1.2b2"
|
||||
# (1, 2, 0, 'rc', 4) => "1.2rc4"
|
||||
# (1, 2, 0, 'final', 0) => "1.2"
|
||||
__version_info__ = (3, 3, 7, 'final', 0)
|
||||
|
||||
|
||||
def _get_version(version_info):
|
||||
" Returns a PEP 440-compliant version number from version_info. "
|
||||
assert len(version_info) == 5
|
||||
assert version_info[3] in ('dev', 'alpha', 'beta', 'rc', 'final')
|
||||
|
||||
parts = 2 if version_info[2] == 0 else 3
|
||||
v = '.'.join(map(str, version_info[:parts]))
|
||||
|
||||
if version_info[3] == 'dev':
|
||||
v += '.dev' + str(version_info[4])
|
||||
elif version_info[3] != 'final':
|
||||
mapping = {'alpha': 'a', 'beta': 'b', 'rc': 'rc'}
|
||||
v += mapping[version_info[3]] + str(version_info[4])
|
||||
|
||||
return v
|
||||
|
||||
|
||||
__version__ = _get_version(__version_info__)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user