added podman, json and yaml

Commit 5226e858bb (parent 01135dea09), 2022-11-27 19:11:46 +01:00
790 changed files with 114578 additions and 16 deletions


containers.yaml (new file, 0 lines)

main.py (52 lines changed)

@@ -8,16 +8,27 @@ from pydantic import BaseModel
 from random import randrange
 import os
 import subprocess
+from podman import PodmanClient
+import json
+import yaml
 
 app = FastAPI()
 
+uri = "unix:///run/user/1000/podman/podman.sock"
+podmanapi = PodmanClient(base_url=uri)
+yaml_file = open("containers.yaml", 'r')
+
 class Post(BaseModel):
     title: str
     content: str
     published: bool = True
     rating: Optional[int] = None
 
+class Environment(BaseModel):
+    username: str
+    uid: str
+
 my_posts= [{"title": "title of post 1", "content": "contents of post 1", "id": 1},{"title": "title of post 2", "content": "contents of post 2", "id": 2}]
 
 def find_post(id):
@@ -71,15 +82,9 @@ async def update_post(id: int, post: Post):
     my_posts[index] = post_dict
     return {"data": post_dict}
 
-@app.get("/create_env")
-async def ping_host():
-    # send one packet of data to the host
-    # this is specified by '-c 1' in the argument list
-    outputlist = []
-    # Iterate over all the servers in the list and ping each server
-    # get the output as a string
-    #output = str(os.system(cmd))
-    #cmd = subprocess.run(["/bin/ls", "-al"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
+@app.post("/create_env", status_code=status.HTTP_201_CREATED)
+async def create_env(env: Environment):
     cmd = subprocess.run(["cat","list"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
     # store the output in the list
     output = (f"{cmd.stdout}")
@@ -88,12 +93,20 @@ async def ping_host():
 @app.get("/environment/{id}")
 async def get_env(id: str):
-    #env = find_post(id)
     cmd = subprocess.run(["cat list | grep %s" % id],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     if cmd.returncode != 0:
         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Post id {id} not found")
     else:
-        return {"post_detail": cmd}
+        return {"environment_detail": cmd}
+
+@app.delete("/environment/{id}")
+async def get_env(id: str):
+    cmd = subprocess.run(["cat list | grep %s" % id],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+    if cmd.returncode != 0:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Post id {id} not found")
+    else:
+        return {"Deleted environment": {id}}
 
 @app.get("/list")
 async def ping_host():
@@ -106,9 +119,16 @@ async def ping_host():
     #cmd = subprocess.run(["/bin/ls", "-al"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True)
     cmd = subprocess.run(["cat list"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     #cmd = subprocess.run(["ls /usr/bin"],stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
-    # store the output in the list
-    #output = (f"{cmd.stdout}")
-    #output1 = (f'{cmd.stdout.decode("utf-8")}')
     output2 = (f'{cmd.stdout.decode("utf-8").lower()}')
     print (output2)
-    return output2
+    #for image in podmanapi.images.list():
+    #    print(image, image.id, "\n")
+    #output = podmanapi.images.list()
+    #json_str = json.dumps(podmanapi.df(), indent=4)
+    image = podmanapi.images.pull("docker.io/nginx", tag="latest")
+    #container = podmanapi.containers.create(image="docker.io/library/nginx:latest",name='test',detach=True,publish_all_ports=True)
+    container = podmanapi.containers.run(image="docker.io/library/nginx:latest",name='test',detach=True,publish_all_ports=True)
+    #run_container = podmanapi.containers.run(image="docker.io/library/nginx:latest",name='test',stdout=True, stderr=False)
+    #json_str = json.dumps(podmanapi.containers.list(), indent=4)
+    #print(json.dumps(podmanapi.version(), indent=4))
+    return Response(content=output2, media_type='application/json')
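
Note: main.py now opens containers.yaml but the diff never parses it. Below is a minimal sketch, assuming a hypothetical list of name/image entries in that file (the committed containers.yaml is empty), of how the new yaml import and the PodmanClient handle could be combined:

```python
import yaml
from podman import PodmanClient

# Hypothetical containers.yaml layout (the committed file is empty):
#   containers:
#     - name: web
#       image: docker.io/library/nginx:latest
client = PodmanClient(base_url="unix:///run/user/1000/podman/podman.sock")
with open("containers.yaml") as f:
    spec = yaml.safe_load(f) or {}

for entry in spec.get("containers", []):
    # pull the image, then start a detached container with all ports published,
    # mirroring the calls used in the /list handler above
    client.images.pull(entry["image"])
    client.containers.run(
        image=entry["image"],
        name=entry["name"],
        detach=True,
        publish_all_ports=True,
    )
```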

venv/bin/ghp-import (new executable file)
@@ -0,0 +1,8 @@
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from ghp_import import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())

venv/bin/markdown_py (new executable file)
@@ -0,0 +1,8 @@
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from markdown.__main__ import run
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(run())

venv/bin/mkdocs (new executable file)
@@ -0,0 +1,8 @@
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from mkdocs.__main__ import cli
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(cli())

venv/bin/normalizer (new executable file)
@@ -0,0 +1,8 @@
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer.cli.normalizer import cli_detect
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(cli_detect())

venv/bin/watchmedo (new executable file)
@@ -0,0 +1,8 @@
#!/home/toverhag/Documents/gitrepos/fastapi_test/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from watchdog.watchmedo import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())

@@ -0,0 +1,29 @@
Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Python Markdown Project nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

@@ -0,0 +1,109 @@
Metadata-Version: 2.1
Name: Markdown
Version: 3.3.7
Summary: Python implementation of Markdown.
Home-page: https://Python-Markdown.github.io/
Author: Manfred Stienstra, Yuri takhteyev and Waylan limberg
Author-email: python.markdown@gmail.com
Maintainer: Waylan Limberg
Maintainer-email: python.markdown@gmail.com
License: BSD License
Project-URL: Documentation, https://Python-Markdown.github.io/
Project-URL: GitHub Project, https://github.com/Python-Markdown/markdown
Project-URL: Issue Tracker, https://github.com/Python-Markdown/markdown/issues
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Communications :: Email :: Filters
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content :: CGI Tools/Libraries
Classifier: Topic :: Internet :: WWW/HTTP :: Site Management
Classifier: Topic :: Software Development :: Documentation
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Filters
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Topic :: Text Processing :: Markup :: Markdown
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE.md
Requires-Dist: importlib-metadata (>=4.4) ; python_version < "3.10"
Provides-Extra: testing
Requires-Dist: coverage ; extra == 'testing'
Requires-Dist: pyyaml ; extra == 'testing'
[Python-Markdown][]
===================
[![Build Status][build-button]][build]
[![Coverage Status][codecov-button]][codecov]
[![Latest Version][mdversion-button]][md-pypi]
[![Python Versions][pyversion-button]][md-pypi]
[![BSD License][bsdlicense-button]][bsdlicense]
[![Code of Conduct][codeofconduct-button]][Code of Conduct]
[build-button]: https://github.com/Python-Markdown/markdown/workflows/CI/badge.svg?event=push
[build]: https://github.com/Python-Markdown/markdown/actions?query=workflow%3ACI+event%3Apush
[codecov-button]: https://codecov.io/gh/Python-Markdown/markdown/branch/master/graph/badge.svg
[codecov]: https://codecov.io/gh/Python-Markdown/markdown
[mdversion-button]: https://img.shields.io/pypi/v/Markdown.svg
[md-pypi]: https://pypi.org/project/Markdown/
[pyversion-button]: https://img.shields.io/pypi/pyversions/Markdown.svg
[bsdlicense-button]: https://img.shields.io/badge/license-BSD-yellow.svg
[bsdlicense]: https://opensource.org/licenses/BSD-3-Clause
[codeofconduct-button]: https://img.shields.io/badge/code%20of%20conduct-contributor%20covenant-green.svg?style=flat-square
[Code of Conduct]: https://github.com/Python-Markdown/markdown/blob/master/CODE_OF_CONDUCT.md
This is a Python implementation of John Gruber's [Markdown][].
It is almost completely compliant with the reference implementation,
though there are a few known issues. See [Features][] for information
on what exactly is supported and what is not. Additional features are
supported by the [Available Extensions][].
[Python-Markdown]: https://Python-Markdown.github.io/
[Markdown]: https://daringfireball.net/projects/markdown/
[Features]: https://Python-Markdown.github.io#Features
[Available Extensions]: https://Python-Markdown.github.io/extensions
Documentation
-------------
```bash
pip install markdown
```
```python
import markdown
html = markdown.markdown(your_text_string)
```
For more advanced [installation] and [usage] documentation, see the `docs/` directory
of the distribution or the project website at <https://Python-Markdown.github.io/>.
[installation]: https://python-markdown.github.io/install/
[usage]: https://python-markdown.github.io/reference/
See the change log at <https://Python-Markdown.github.io/change_log>.
Support
-------
You may report bugs, ask for help, and discuss various other issues on the [bug tracker][].
[bug tracker]: https://github.com/Python-Markdown/markdown/issues
Code of Conduct
---------------
Everyone interacting in the Python-Markdown project's codebases, issue trackers,
and mailing lists is expected to follow the [Code of Conduct].

@@ -0,0 +1,76 @@
../../../bin/markdown_py,sha256=5bO7NpqwwsC9tIBJKVslf9gK935iQp-pRU1efIXs_cQ,260
Markdown-3.3.7.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
Markdown-3.3.7.dist-info/LICENSE.md,sha256=bxGTy2NHGOZcOlN9biXr1hSCDsDvaTz8EiSBEmONZNo,1645
Markdown-3.3.7.dist-info/METADATA,sha256=yDPc4gkbiMBomDr_gIAPsQ8zVfkYoe9mRkon4KZ889A,4629
Markdown-3.3.7.dist-info/RECORD,,
Markdown-3.3.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
Markdown-3.3.7.dist-info/entry_points.txt,sha256=lMEyiiA_ZZyfPCBlDviBl-SiU0cfoeuEKpwxw361sKQ,1102
Markdown-3.3.7.dist-info/top_level.txt,sha256=IAxs8x618RXoH1uCqeLLxXsDefJvE_mIibr_M4sOlyk,9
markdown/__init__.py,sha256=002-LuHviYzROW2rg_gBGai81nMouUNO9UFj5nSsTSk,2065
markdown/__main__.py,sha256=JX1057VoovH3NA5uH5nQdQE8b0kXoeT79ZxCzFoL_kg,5803
markdown/__meta__.py,sha256=AjQfLZ5mSCOzSQk-HNK3jedyEPUyaZMTKwyXjRxCHso,1630
markdown/__pycache__/__init__.cpython-311.pyc,,
markdown/__pycache__/__main__.cpython-311.pyc,,
markdown/__pycache__/__meta__.cpython-311.pyc,,
markdown/__pycache__/blockparser.cpython-311.pyc,,
markdown/__pycache__/blockprocessors.cpython-311.pyc,,
markdown/__pycache__/core.cpython-311.pyc,,
markdown/__pycache__/htmlparser.cpython-311.pyc,,
markdown/__pycache__/inlinepatterns.cpython-311.pyc,,
markdown/__pycache__/pep562.cpython-311.pyc,,
markdown/__pycache__/postprocessors.cpython-311.pyc,,
markdown/__pycache__/preprocessors.cpython-311.pyc,,
markdown/__pycache__/serializers.cpython-311.pyc,,
markdown/__pycache__/test_tools.cpython-311.pyc,,
markdown/__pycache__/treeprocessors.cpython-311.pyc,,
markdown/__pycache__/util.cpython-311.pyc,,
markdown/blockparser.py,sha256=JpBhOokOoBUGCXolftOc5m1hPcR2y9s9hVd9WSuhHzo,4285
markdown/blockprocessors.py,sha256=sUHIdAsGGAEN443TusbiNOWYt38DsHV-RqJdGjumIFc,24893
markdown/core.py,sha256=ZHtqvLdVHOKWIuX_UzdL3rIcxMwji5TC5ZCkV19iM4U,15401
markdown/extensions/__init__.py,sha256=kw5SehW1-IRGZk-ZR4aLQp2F0Qqj-Qql35qIBA8efnU,3663
markdown/extensions/__pycache__/__init__.cpython-311.pyc,,
markdown/extensions/__pycache__/abbr.cpython-311.pyc,,
markdown/extensions/__pycache__/admonition.cpython-311.pyc,,
markdown/extensions/__pycache__/attr_list.cpython-311.pyc,,
markdown/extensions/__pycache__/codehilite.cpython-311.pyc,,
markdown/extensions/__pycache__/def_list.cpython-311.pyc,,
markdown/extensions/__pycache__/extra.cpython-311.pyc,,
markdown/extensions/__pycache__/fenced_code.cpython-311.pyc,,
markdown/extensions/__pycache__/footnotes.cpython-311.pyc,,
markdown/extensions/__pycache__/legacy_attrs.cpython-311.pyc,,
markdown/extensions/__pycache__/legacy_em.cpython-311.pyc,,
markdown/extensions/__pycache__/md_in_html.cpython-311.pyc,,
markdown/extensions/__pycache__/meta.cpython-311.pyc,,
markdown/extensions/__pycache__/nl2br.cpython-311.pyc,,
markdown/extensions/__pycache__/sane_lists.cpython-311.pyc,,
markdown/extensions/__pycache__/smarty.cpython-311.pyc,,
markdown/extensions/__pycache__/tables.cpython-311.pyc,,
markdown/extensions/__pycache__/toc.cpython-311.pyc,,
markdown/extensions/__pycache__/wikilinks.cpython-311.pyc,,
markdown/extensions/abbr.py,sha256=5TNU5ml6-H1n-fztEkgUphSTvp5yKCXaiPZMrVuRFvo,3186
markdown/extensions/admonition.py,sha256=INIecvdzQ7RLmgP8M-N6AZJ5uMd6dBfh9Uj6YibgNLk,5847
markdown/extensions/attr_list.py,sha256=nhKFY_u6BVyKW2oMUeC4wEjqFNGpDSnNXqaohuF6M7I,5988
markdown/extensions/codehilite.py,sha256=C4Jiuc-EwQQaedIGo_2sGyoZR6YgTR0FocusKWZc6Vg,11710
markdown/extensions/def_list.py,sha256=HKStriCfwosWRjHgiph6hHIHwGjasEaE6UYW-_hstVo,3635
markdown/extensions/extra.py,sha256=ruwYAcbIaFxAmcT4pLoaRdw8Ok6sFTYWza7OAstcvtI,1831
markdown/extensions/fenced_code.py,sha256=Hd2RDaRWcCd4aI9fedoI6EElPmIVrD2BlvXdhRV64ik,7209
markdown/extensions/footnotes.py,sha256=Ux13UAjNiptuyvCnDIII89YWTZ0DTrEmkrwcyen7ZgM,15485
markdown/extensions/legacy_attrs.py,sha256=qx4d8c_mxt0JZ7wP9Sfskvi3cZN-OtDGTFCi4gapZ74,2547
markdown/extensions/legacy_em.py,sha256=8mtzOGYu_FXKO7DrBVr_5v5ZH6ru1yv1TiobYBEFV5Q,1582
markdown/extensions/md_in_html.py,sha256=F4CUIa2DjDPLEIuJCbmbw9jL1mbFloPhraedynZL9Ig,15829
markdown/extensions/meta.py,sha256=EUfkzM7l7UpH__Or9K3pl8ldVddwndlCZWA3d712RAE,2331
markdown/extensions/nl2br.py,sha256=wAqTNOuf2L1NzlEvEqoID70n9y-aiYaGLkuyQk3CD0w,783
markdown/extensions/sane_lists.py,sha256=ZQmCf-247KBexVG0fc62nDvokGkV6W1uavYbieNKSG4,1505
markdown/extensions/smarty.py,sha256=0padzkVCNACainKw-Xj1S5UfT0125VCTfNejmrCZItA,10238
markdown/extensions/tables.py,sha256=u8gmond8fQqJB4nVcljvDG5EvEueMPn_UaPEp5o9hF0,7685
markdown/extensions/toc.py,sha256=uOGElJ4K4-BAYGHdjwUT1Xv9iQ-PAgwI493NpyoRC9g,14136
markdown/extensions/wikilinks.py,sha256=GkgT9BY7b1-qW--dIwFAhC9V20RoeF13b7CFdw_V21Q,2812
markdown/htmlparser.py,sha256=Eios9Ui8L5IvdmlN8aTXioiTgOrrnUrxH2kPpLhgU0U,13033
markdown/inlinepatterns.py,sha256=nVzm5T01WvCMSyLeopmVwacvYYBvQw0Ac-Svj_TnudI,29775
markdown/pep562.py,sha256=5UkqT7sb-cQufgbOl_jF-RYUVVHS7VThzlMzR9vrd3I,8917
markdown/postprocessors.py,sha256=NeJyWBqPeDuBBJLTGs5Bfm5oTkUBXk9HWBeQy2_OldI,4262
markdown/preprocessors.py,sha256=-s8QGHGlX7JAIJTfCivuc-CVwTLWs0IyEU94YUT2IvQ,2742
markdown/serializers.py,sha256=_wQl-iJrPSUEQ4Q1owWYqN9qceVh6TOlAOH_i44BKAQ,6540
markdown/test_tools.py,sha256=Iht9NMNtDgtrMCNdjkMN9EWXjgsJjaFzJQVvzxo1Da0,8363
markdown/treeprocessors.py,sha256=MIdj6cv1YiIafGFk8wy-hhV3ZQfgvuIdMErwA7286Ns,15434
markdown/util.py,sha256=Xo8dhPcIwzYZ7RzzIGXoeC3Nq41aaP16B7JO076e28A,16063

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

@@ -0,0 +1,22 @@
[console_scripts]
markdown_py = markdown.__main__:run
[markdown.extensions]
abbr = markdown.extensions.abbr:AbbrExtension
admonition = markdown.extensions.admonition:AdmonitionExtension
attr_list = markdown.extensions.attr_list:AttrListExtension
codehilite = markdown.extensions.codehilite:CodeHiliteExtension
def_list = markdown.extensions.def_list:DefListExtension
extra = markdown.extensions.extra:ExtraExtension
fenced_code = markdown.extensions.fenced_code:FencedCodeExtension
footnotes = markdown.extensions.footnotes:FootnoteExtension
legacy_attrs = markdown.extensions.legacy_attrs:LegacyAttrExtension
legacy_em = markdown.extensions.legacy_em:LegacyEmExtension
md_in_html = markdown.extensions.md_in_html:MarkdownInHtmlExtension
meta = markdown.extensions.meta:MetaExtension
nl2br = markdown.extensions.nl2br:Nl2BrExtension
sane_lists = markdown.extensions.sane_lists:SaneListExtension
smarty = markdown.extensions.smarty:SmartyExtension
tables = markdown.extensions.tables:TableExtension
toc = markdown.extensions.toc:TocExtension
wikilinks = markdown.extensions.wikilinks:WikiLinkExtension
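
The [markdown.extensions] entry points above map short names onto Extension classes. A small illustrative sketch of enabling two of them through the public markdown.markdown() call (the input text is made up):

```python
import markdown

# the extension names match the [markdown.extensions] entry points listed above
text = "| a | b |\n|---|---|\n| 1 | 2 |"
html = markdown.markdown(text, extensions=["tables", "fenced_code"])
print(html)
```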

@@ -0,0 +1 @@
markdown

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,269 @@
Metadata-Version: 2.1
Name: charset-normalizer
Version: 2.1.1
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/ousret/charset_normalizer
Author: Ahmed TAHRI @Ousret
Author-email: ahmed.tahri@cloudnursery.dev
License: MIT
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Typing :: Typed
Requires-Python: >=3.6.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
Requires-Dist: unicodedata2 ; extra == 'unicode_backport'
<h1 align="center">Charset Detection, for Everyone 👋 <a href="https://twitter.com/intent/tweet?text=The%20Real%20First%20Universal%20Charset%20%26%20Language%20Detector&url=https://www.github.com/Ousret/charset_normalizer&hashtags=python,encoding,chardet,developers"><img src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social"/></a></h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
<img src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" />
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
| ------------- | :-------------: | :------------------: | :------------------: |
| `Fast` | ❌<br> | ✅<br> | ✅ <br> |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
## ⭐ Your support
*Fork, test-it, star-it, submit your ideas! We do listen.*
## ⚡ Performance
This package offer better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
| charset-normalizer | 400 ms | 200 ms | 15 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
> (eg. Supported Encoding) Challenge-them if you want.
[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
## ✨ Installation
Using PyPi for latest stable
```sh
pip install charset-normalizer -U
```
If you want a more up-to-date `unicodedata` than the one available in your Python setup.
```sh
pip install charset-normalizer[unicode_backport] -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
:tada: Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
*Normalize any text file*
```python
from charset_normalizer import normalize
try:
normalize('./my_subtitle.srt') # should write to disk my_subtitle-***.srt
except IOError as e:
print('Sadly, we are unable to perform charset normalization.', str(e))
```
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
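
As a minimal sketch of that drop-in usage (the sample bytes are illustrative), the legacy detect() helper returns a chardet-style dict:

```python
from charset_normalizer import detect

raw = "Comment ça va ?".encode("cp1252")
result = detect(raw)
# chardet-compatible keys: 'encoding', 'language', 'confidence'
print(result["encoding"], result["confidence"])
```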
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down on a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered string.**
What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
## 🍰 How
- Discard all charset encoding table that could not fit the binary content.
- Measure chaos, or the mess once opened (by chunks) with a corresponding charset encoding.
- Extract matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what is chaos/mess and coherence according to **YOU ?**
*Chaos :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
I know that my interpretation of what is chaotic is very subjective, feel free to contribute in order to
improve or rewrite it.
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
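
A short sketch of how those two measurements surface through the Python API; treating chaos and coherence as attributes of the best match mirrors the fields in the CLI JSON above and is an assumption about the CharsetMatch model:

```python
from charset_normalizer import from_bytes

payload = "Bсеки човек има право на образование.".encode("utf_8")
best_guess = from_bytes(payload).best()
if best_guess is not None:
    # lower chaos and higher coherence mean a more plausible decoding
    print(best_guess.encoding, best_guess.chaos, best_guess.coherence)
```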
## ⚡ Known limitations
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © 2019 [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)

@@ -0,0 +1,33 @@
../../../bin/normalizer,sha256=b3q6GnKL6HKDTJiUkB-KSeBZ9IklrLVlcj-BQXVT6kY,290
charset_normalizer-2.1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-2.1.1.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
charset_normalizer-2.1.1.dist-info/METADATA,sha256=C99l12g4d1E9_UiW-mqPCWx7v2M_lYGWxy1GTOjXSsA,11942
charset_normalizer-2.1.1.dist-info/RECORD,,
charset_normalizer-2.1.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
charset_normalizer-2.1.1.dist-info/entry_points.txt,sha256=uYo8aIGLWv8YgWfSna5HnfY_En4pkF1w4bgawNAXzP0,76
charset_normalizer-2.1.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=jGhhf1IcOgCpZsr593E9fPvjWKnflVqHe_LwkOJjInU,1790
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/__pycache__/api.cpython-311.pyc,,
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
charset_normalizer/__pycache__/md.cpython-311.pyc,,
charset_normalizer/__pycache__/models.cpython-311.pyc,,
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
charset_normalizer/__pycache__/version.cpython-311.pyc,,
charset_normalizer/api.py,sha256=euVPmjAMbjpqhEHPjfKtyy1mK52U0TOUBUQgM_Qy6eE,19191
charset_normalizer/assets/__init__.py,sha256=r7aakPaRIc2FFG2mw2V8NOTvkl25_euKZ3wPf5SAVa4,15222
charset_normalizer/assets/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/cd.py,sha256=Pxdkbn4cy0iZF42KTb1FiWIqqKobuz_fDjGwc6JMNBc,10811
charset_normalizer/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/cli/__pycache__/normalizer.cpython-311.pyc,,
charset_normalizer/cli/normalizer.py,sha256=FmD1RXeMpRBg_mjR0MaJhNUpM2qZ8wz2neAE7AayBeg,9521
charset_normalizer/constant.py,sha256=NgU-pY8JH2a9lkVT8oKwAFmIUYNKOuSBwZgF9MrlNCM,19157
charset_normalizer/legacy.py,sha256=XKeZOts_HdYQU_Jb3C9ZfOjY2CiUL132k9_nXer8gig,3384
charset_normalizer/md.py,sha256=pZP8IVpSC82D8INA9Tf_y0ijJSRI-UIncZvLdfTWEd4,17642
charset_normalizer/models.py,sha256=i68YdlSLTEI3EEBVXq8TLNAbyyjrLC2OWszc-OBAk9I,13167
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=ykOznhcAeH-ODLBWJuI7t1nbwa1SAfN_bDYTCJGyh4U,11771
charset_normalizer/version.py,sha256=_eh2MA3qS__IajlePQxKBmlw6zaBDvPYlLdEgxgIojw,79

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

@@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli.normalizer:cli_detect

@@ -0,0 +1 @@
charset_normalizer

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Others methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging
from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
CharsetDetector,
CharsetDoctor,
CharsetNormalizerMatch,
CharsetNormalizerMatches,
detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"normalize",
"detect",
"CharsetMatch",
"CharsetMatches",
"CharsetNormalizerMatch",
"CharsetNormalizerMatches",
"CharsetDetector",
"CharsetDoctor",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

@@ -0,0 +1,584 @@
import logging
import warnings
from os import PathLike
from os.path import basename, splitext
from typing import Any, BinaryIO, List, Optional, Set
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: bytes,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
but never take it for granted. Can improve the performance.
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: List[str] = []
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None
results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: List[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(mess_ratio(chunk, threshold))
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk, 0.1, ",".join(target_languages) if target_languages else None
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
results.append(
CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
)
)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
logger.debug(
"Encoding detection: %s is most likely the one.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
)
def from_path(
path: "PathLike[Any]",
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
)
def normalize(
path: "PathLike[Any]",
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
) -> CharsetMatch:
"""
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
"""
warnings.warn(
"normalize is deprecated and will be removed in 3.0",
DeprecationWarning,
)
results = from_path(
path,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
)
filename = basename(path)
target_extensions = list(splitext(filename))
if len(results) == 0:
raise IOError(
'Unable to normalize "{}", no encoding charset seems to fit.'.format(
filename
)
)
result = results.best()
target_extensions[0] += "-" + result.encoding # type: ignore
with open(
"{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
) as fp:
fp.write(result.output()) # type: ignore
return result # type: ignore
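
A brief usage sketch of the from_bytes entry point defined above, exercising the cp_isolation and explain toggles described in its docstring (the sample text and code pages are illustrative):

```python
from charset_normalizer import from_bytes

data = "Přiliš žluťoučký kůň".encode("cp1250")
# cp_isolation narrows the candidate code pages; explain=True attaches the
# StreamHandler configured at module level so each probing step is logged
results = from_bytes(data, cp_isolation=["cp1250", "cp1252"], explain=True)
best = results.best()
print(best.encoding if best else "no match")
```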

File diff suppressed because it is too large.

@@ -0,0 +1,339 @@
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
from .assets import FREQUENCIES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> List[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
"""
Return associated languages associated to given characters.
"""
languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: List[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
for character in ordered_characters:
if character not in FREQUENCIES_language_set:
continue
characters_before_source: List[str] = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
]
characters_after_source: List[str] = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
]
characters_before: List[str] = ordered_characters[
0 : ordered_characters.index(character)
]
characters_after: List[str] = ordered_characters[
ordered_characters.index(character) :
]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
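def _demo_alpha_unicode_split() -> None:
    """
    Editor's illustrative sketch, not part of the upstream file: a text mixing Latin
    and Cyrillic letters is split into one lower-cased layer per Unicode range.
    """
    layers = alpha_unicode_split("Hello Привет")
    print(layers)  # expected: two strings, one Latin-only and one Cyrillic-only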
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
"""
This function merges results previously produced by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in the given sequence. The sequence is analysed layer by layer.
A layer = Character extraction by alphabets/ranges.
"""
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(results, key=lambda x: x[1], reverse=True)
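if __name__ == "__main__":
    # Editor's illustrative sketch, not part of the upstream file: coherence_ratio
    # returns (language, ratio) pairs, best first, for each alphabet layer found in
    # the input; sequences shorter than TOO_SMALL_SEQUENCE produce no result at all.
    sample = "Ceci est un exemple de texte suffisamment long pour être analysé. " * 2
    print(coherence_ratio(sample))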


@@ -0,0 +1,295 @@
import argparse
import sys
from json import dumps
from os.path import abspath
from platform import python_version
from typing import List, Optional
try:
from unicodedata2 import unidata_version
except ImportError:
from unicodedata import unidata_version
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else means trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit normalization of the input file. If not set, the program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {}".format(
__version__, python_version(), unidata_version
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
print("Use --replace only in conjunction with --normalize.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
print("Use --force only in conjunction with --replace.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
o_: List[str] = my_file.name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure you want to normalize "{}" by replacing it?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()
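# Editor's note, not part of the upstream file: typical invocations of this CLI.
# The "normalizer" console-script name is an assumption about the packaging; the
# module form below works regardless:
#
#   normalizer ./my_file.srt
#   normalizer --verbose --with-alternative ./my_file.srt
#   python -m charset_normalizer.cli.normalizer --normalize ./my_file.srt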


@@ -0,0 +1,497 @@
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union
from .assets import FREQUENCIES
# Contains, for each eligible encoding, its SIG/BOM bytes (a single mark or a list of marks)
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
"utf_8": BOM_UTF8,
"utf_7": [
b"\x2b\x2f\x76\x38",
b"\x2b\x2f\x76\x39",
b"\x2b\x2f\x76\x2b",
b"\x2b\x2f\x76\x2f",
b"\x2b\x2f\x76\x38\x2d",
],
"gb18030": b"\x84\x31\x95\x33",
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = int(10e6)
UTF8_MAXIMAL_ALLOCATION: int = 1112064
UNICODE_RANGES_COMBINED: Dict[str, range] = {
"Control character": range(31 + 1),
"Basic Latin": range(32, 127 + 1),
"Latin-1 Supplement": range(128, 255 + 1),
"Latin Extended-A": range(256, 383 + 1),
"Latin Extended-B": range(384, 591 + 1),
"IPA Extensions": range(592, 687 + 1),
"Spacing Modifier Letters": range(688, 767 + 1),
"Combining Diacritical Marks": range(768, 879 + 1),
"Greek and Coptic": range(880, 1023 + 1),
"Cyrillic": range(1024, 1279 + 1),
"Cyrillic Supplement": range(1280, 1327 + 1),
"Armenian": range(1328, 1423 + 1),
"Hebrew": range(1424, 1535 + 1),
"Arabic": range(1536, 1791 + 1),
"Syriac": range(1792, 1871 + 1),
"Arabic Supplement": range(1872, 1919 + 1),
"Thaana": range(1920, 1983 + 1),
"NKo": range(1984, 2047 + 1),
"Samaritan": range(2048, 2111 + 1),
"Mandaic": range(2112, 2143 + 1),
"Syriac Supplement": range(2144, 2159 + 1),
"Arabic Extended-A": range(2208, 2303 + 1),
"Devanagari": range(2304, 2431 + 1),
"Bengali": range(2432, 2559 + 1),
"Gurmukhi": range(2560, 2687 + 1),
"Gujarati": range(2688, 2815 + 1),
"Oriya": range(2816, 2943 + 1),
"Tamil": range(2944, 3071 + 1),
"Telugu": range(3072, 3199 + 1),
"Kannada": range(3200, 3327 + 1),
"Malayalam": range(3328, 3455 + 1),
"Sinhala": range(3456, 3583 + 1),
"Thai": range(3584, 3711 + 1),
"Lao": range(3712, 3839 + 1),
"Tibetan": range(3840, 4095 + 1),
"Myanmar": range(4096, 4255 + 1),
"Georgian": range(4256, 4351 + 1),
"Hangul Jamo": range(4352, 4607 + 1),
"Ethiopic": range(4608, 4991 + 1),
"Ethiopic Supplement": range(4992, 5023 + 1),
"Cherokee": range(5024, 5119 + 1),
"Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1),
"Ogham": range(5760, 5791 + 1),
"Runic": range(5792, 5887 + 1),
"Tagalog": range(5888, 5919 + 1),
"Hanunoo": range(5920, 5951 + 1),
"Buhid": range(5952, 5983 + 1),
"Tagbanwa": range(5984, 6015 + 1),
"Khmer": range(6016, 6143 + 1),
"Mongolian": range(6144, 6319 + 1),
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1),
"Limbu": range(6400, 6479 + 1),
"Tai Le": range(6480, 6527 + 1),
"New Tai Lue": range(6528, 6623 + 1),
"Khmer Symbols": range(6624, 6655 + 1),
"Buginese": range(6656, 6687 + 1),
"Tai Tham": range(6688, 6831 + 1),
"Combining Diacritical Marks Extended": range(6832, 6911 + 1),
"Balinese": range(6912, 7039 + 1),
"Sundanese": range(7040, 7103 + 1),
"Batak": range(7104, 7167 + 1),
"Lepcha": range(7168, 7247 + 1),
"Ol Chiki": range(7248, 7295 + 1),
"Cyrillic Extended C": range(7296, 7311 + 1),
"Sundanese Supplement": range(7360, 7375 + 1),
"Vedic Extensions": range(7376, 7423 + 1),
"Phonetic Extensions": range(7424, 7551 + 1),
"Phonetic Extensions Supplement": range(7552, 7615 + 1),
"Combining Diacritical Marks Supplement": range(7616, 7679 + 1),
"Latin Extended Additional": range(7680, 7935 + 1),
"Greek Extended": range(7936, 8191 + 1),
"General Punctuation": range(8192, 8303 + 1),
"Superscripts and Subscripts": range(8304, 8351 + 1),
"Currency Symbols": range(8352, 8399 + 1),
"Combining Diacritical Marks for Symbols": range(8400, 8447 + 1),
"Letterlike Symbols": range(8448, 8527 + 1),
"Number Forms": range(8528, 8591 + 1),
"Arrows": range(8592, 8703 + 1),
"Mathematical Operators": range(8704, 8959 + 1),
"Miscellaneous Technical": range(8960, 9215 + 1),
"Control Pictures": range(9216, 9279 + 1),
"Optical Character Recognition": range(9280, 9311 + 1),
"Enclosed Alphanumerics": range(9312, 9471 + 1),
"Box Drawing": range(9472, 9599 + 1),
"Block Elements": range(9600, 9631 + 1),
"Geometric Shapes": range(9632, 9727 + 1),
"Miscellaneous Symbols": range(9728, 9983 + 1),
"Dingbats": range(9984, 10175 + 1),
"Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1),
"Supplemental Arrows-A": range(10224, 10239 + 1),
"Braille Patterns": range(10240, 10495 + 1),
"Supplemental Arrows-B": range(10496, 10623 + 1),
"Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1),
"Supplemental Mathematical Operators": range(10752, 11007 + 1),
"Miscellaneous Symbols and Arrows": range(11008, 11263 + 1),
"Glagolitic": range(11264, 11359 + 1),
"Latin Extended-C": range(11360, 11391 + 1),
"Coptic": range(11392, 11519 + 1),
"Georgian Supplement": range(11520, 11567 + 1),
"Tifinagh": range(11568, 11647 + 1),
"Ethiopic Extended": range(11648, 11743 + 1),
"Cyrillic Extended-A": range(11744, 11775 + 1),
"Supplemental Punctuation": range(11776, 11903 + 1),
"CJK Radicals Supplement": range(11904, 12031 + 1),
"Kangxi Radicals": range(12032, 12255 + 1),
"Ideographic Description Characters": range(12272, 12287 + 1),
"CJK Symbols and Punctuation": range(12288, 12351 + 1),
"Hiragana": range(12352, 12447 + 1),
"Katakana": range(12448, 12543 + 1),
"Bopomofo": range(12544, 12591 + 1),
"Hangul Compatibility Jamo": range(12592, 12687 + 1),
"Kanbun": range(12688, 12703 + 1),
"Bopomofo Extended": range(12704, 12735 + 1),
"CJK Strokes": range(12736, 12783 + 1),
"Katakana Phonetic Extensions": range(12784, 12799 + 1),
"Enclosed CJK Letters and Months": range(12800, 13055 + 1),
"CJK Compatibility": range(13056, 13311 + 1),
"CJK Unified Ideographs Extension A": range(13312, 19903 + 1),
"Yijing Hexagram Symbols": range(19904, 19967 + 1),
"CJK Unified Ideographs": range(19968, 40959 + 1),
"Yi Syllables": range(40960, 42127 + 1),
"Yi Radicals": range(42128, 42191 + 1),
"Lisu": range(42192, 42239 + 1),
"Vai": range(42240, 42559 + 1),
"Cyrillic Extended-B": range(42560, 42655 + 1),
"Bamum": range(42656, 42751 + 1),
"Modifier Tone Letters": range(42752, 42783 + 1),
"Latin Extended-D": range(42784, 43007 + 1),
"Syloti Nagri": range(43008, 43055 + 1),
"Common Indic Number Forms": range(43056, 43071 + 1),
"Phags-pa": range(43072, 43135 + 1),
"Saurashtra": range(43136, 43231 + 1),
"Devanagari Extended": range(43232, 43263 + 1),
"Kayah Li": range(43264, 43311 + 1),
"Rejang": range(43312, 43359 + 1),
"Hangul Jamo Extended-A": range(43360, 43391 + 1),
"Javanese": range(43392, 43487 + 1),
"Myanmar Extended-B": range(43488, 43519 + 1),
"Cham": range(43520, 43615 + 1),
"Myanmar Extended-A": range(43616, 43647 + 1),
"Tai Viet": range(43648, 43743 + 1),
"Meetei Mayek Extensions": range(43744, 43775 + 1),
"Ethiopic Extended-A": range(43776, 43823 + 1),
"Latin Extended-E": range(43824, 43887 + 1),
"Cherokee Supplement": range(43888, 43967 + 1),
"Meetei Mayek": range(43968, 44031 + 1),
"Hangul Syllables": range(44032, 55215 + 1),
"Hangul Jamo Extended-B": range(55216, 55295 + 1),
"High Surrogates": range(55296, 56191 + 1),
"High Private Use Surrogates": range(56192, 56319 + 1),
"Low Surrogates": range(56320, 57343 + 1),
"Private Use Area": range(57344, 63743 + 1),
"CJK Compatibility Ideographs": range(63744, 64255 + 1),
"Alphabetic Presentation Forms": range(64256, 64335 + 1),
"Arabic Presentation Forms-A": range(64336, 65023 + 1),
"Variation Selectors": range(65024, 65039 + 1),
"Vertical Forms": range(65040, 65055 + 1),
"Combining Half Marks": range(65056, 65071 + 1),
"CJK Compatibility Forms": range(65072, 65103 + 1),
"Small Form Variants": range(65104, 65135 + 1),
"Arabic Presentation Forms-B": range(65136, 65279 + 1),
"Halfwidth and Fullwidth Forms": range(65280, 65519 + 1),
"Specials": range(65520, 65535 + 1),
"Linear B Syllabary": range(65536, 65663 + 1),
"Linear B Ideograms": range(65664, 65791 + 1),
"Aegean Numbers": range(65792, 65855 + 1),
"Ancient Greek Numbers": range(65856, 65935 + 1),
"Ancient Symbols": range(65936, 65999 + 1),
"Phaistos Disc": range(66000, 66047 + 1),
"Lycian": range(66176, 66207 + 1),
"Carian": range(66208, 66271 + 1),
"Coptic Epact Numbers": range(66272, 66303 + 1),
"Old Italic": range(66304, 66351 + 1),
"Gothic": range(66352, 66383 + 1),
"Old Permic": range(66384, 66431 + 1),
"Ugaritic": range(66432, 66463 + 1),
"Old Persian": range(66464, 66527 + 1),
"Deseret": range(66560, 66639 + 1),
"Shavian": range(66640, 66687 + 1),
"Osmanya": range(66688, 66735 + 1),
"Osage": range(66736, 66815 + 1),
"Elbasan": range(66816, 66863 + 1),
"Caucasian Albanian": range(66864, 66927 + 1),
"Linear A": range(67072, 67455 + 1),
"Cypriot Syllabary": range(67584, 67647 + 1),
"Imperial Aramaic": range(67648, 67679 + 1),
"Palmyrene": range(67680, 67711 + 1),
"Nabataean": range(67712, 67759 + 1),
"Hatran": range(67808, 67839 + 1),
"Phoenician": range(67840, 67871 + 1),
"Lydian": range(67872, 67903 + 1),
"Meroitic Hieroglyphs": range(67968, 67999 + 1),
"Meroitic Cursive": range(68000, 68095 + 1),
"Kharoshthi": range(68096, 68191 + 1),
"Old South Arabian": range(68192, 68223 + 1),
"Old North Arabian": range(68224, 68255 + 1),
"Manichaean": range(68288, 68351 + 1),
"Avestan": range(68352, 68415 + 1),
"Inscriptional Parthian": range(68416, 68447 + 1),
"Inscriptional Pahlavi": range(68448, 68479 + 1),
"Psalter Pahlavi": range(68480, 68527 + 1),
"Old Turkic": range(68608, 68687 + 1),
"Old Hungarian": range(68736, 68863 + 1),
"Rumi Numeral Symbols": range(69216, 69247 + 1),
"Brahmi": range(69632, 69759 + 1),
"Kaithi": range(69760, 69839 + 1),
"Sora Sompeng": range(69840, 69887 + 1),
"Chakma": range(69888, 69967 + 1),
"Mahajani": range(69968, 70015 + 1),
"Sharada": range(70016, 70111 + 1),
"Sinhala Archaic Numbers": range(70112, 70143 + 1),
"Khojki": range(70144, 70223 + 1),
"Multani": range(70272, 70319 + 1),
"Khudawadi": range(70320, 70399 + 1),
"Grantha": range(70400, 70527 + 1),
"Newa": range(70656, 70783 + 1),
"Tirhuta": range(70784, 70879 + 1),
"Siddham": range(71040, 71167 + 1),
"Modi": range(71168, 71263 + 1),
"Mongolian Supplement": range(71264, 71295 + 1),
"Takri": range(71296, 71375 + 1),
"Ahom": range(71424, 71487 + 1),
"Warang Citi": range(71840, 71935 + 1),
"Zanabazar Square": range(72192, 72271 + 1),
"Soyombo": range(72272, 72367 + 1),
"Pau Cin Hau": range(72384, 72447 + 1),
"Bhaiksuki": range(72704, 72815 + 1),
"Marchen": range(72816, 72895 + 1),
"Masaram Gondi": range(72960, 73055 + 1),
"Cuneiform": range(73728, 74751 + 1),
"Cuneiform Numbers and Punctuation": range(74752, 74879 + 1),
"Early Dynastic Cuneiform": range(74880, 75087 + 1),
"Egyptian Hieroglyphs": range(77824, 78895 + 1),
"Anatolian Hieroglyphs": range(82944, 83583 + 1),
"Bamum Supplement": range(92160, 92735 + 1),
"Mro": range(92736, 92783 + 1),
"Bassa Vah": range(92880, 92927 + 1),
"Pahawh Hmong": range(92928, 93071 + 1),
"Miao": range(93952, 94111 + 1),
"Ideographic Symbols and Punctuation": range(94176, 94207 + 1),
"Tangut": range(94208, 100351 + 1),
"Tangut Components": range(100352, 101119 + 1),
"Kana Supplement": range(110592, 110847 + 1),
"Kana Extended-A": range(110848, 110895 + 1),
"Nushu": range(110960, 111359 + 1),
"Duployan": range(113664, 113823 + 1),
"Shorthand Format Controls": range(113824, 113839 + 1),
"Byzantine Musical Symbols": range(118784, 119039 + 1),
"Musical Symbols": range(119040, 119295 + 1),
"Ancient Greek Musical Notation": range(119296, 119375 + 1),
"Tai Xuan Jing Symbols": range(119552, 119647 + 1),
"Counting Rod Numerals": range(119648, 119679 + 1),
"Mathematical Alphanumeric Symbols": range(119808, 120831 + 1),
"Sutton SignWriting": range(120832, 121519 + 1),
"Glagolitic Supplement": range(122880, 122927 + 1),
"Mende Kikakui": range(124928, 125151 + 1),
"Adlam": range(125184, 125279 + 1),
"Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1),
"Mahjong Tiles": range(126976, 127023 + 1),
"Domino Tiles": range(127024, 127135 + 1),
"Playing Cards": range(127136, 127231 + 1),
"Enclosed Alphanumeric Supplement": range(127232, 127487 + 1),
"Enclosed Ideographic Supplement": range(127488, 127743 + 1),
"Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1),
"Emoticons range(Emoji)": range(128512, 128591 + 1),
"Ornamental Dingbats": range(128592, 128639 + 1),
"Transport and Map Symbols": range(128640, 128767 + 1),
"Alchemical Symbols": range(128768, 128895 + 1),
"Geometric Shapes Extended": range(128896, 129023 + 1),
"Supplemental Arrows-C": range(129024, 129279 + 1),
"Supplemental Symbols and Pictographs": range(129280, 129535 + 1),
"CJK Unified Ideographs Extension B": range(131072, 173791 + 1),
"CJK Unified Ideographs Extension C": range(173824, 177983 + 1),
"CJK Unified Ideographs Extension D": range(177984, 178207 + 1),
"CJK Unified Ideographs Extension E": range(178208, 183983 + 1),
"CJK Unified Ideographs Extension F": range(183984, 191471 + 1),
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
"Tags": range(917504, 917631 + 1),
"Variation Selectors Supplement": range(917760, 917999 + 1),
}
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
"Supplement",
"Extended",
"Extensions",
"Modifier",
"Marks",
"Punctuation",
"Symbols",
"Forms",
"Operators",
"Miscellaneous",
"Drawing",
"Block",
"Shapes",
"Supplemental",
"Tags",
]
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
IGNORECASE,
)
IANA_SUPPORTED: List[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
list(set(aliases.values())),
)
)
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# Pre-computed code pages that are similar, as determined by the function cp_similarity.
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
"cp1250": ["iso8859_2"],
"cp1251": ["kz1048", "ptcp154"],
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
"cp1253": ["iso8859_7"],
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
"cp1257": ["iso8859_13"],
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
"cp850": ["cp437", "cp857", "cp858", "cp865"],
"cp857": ["cp850", "cp858", "cp865"],
"cp858": ["cp437", "cp850", "cp857", "cp865"],
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
"cp866": ["cp1125"],
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
"iso8859_11": ["tis_620"],
"iso8859_13": ["cp1257"],
"iso8859_14": [
"iso8859_10",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_15": [
"cp1252",
"cp1254",
"iso8859_10",
"iso8859_14",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_16": [
"iso8859_14",
"iso8859_15",
"iso8859_2",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
"iso8859_7": ["cp1253"],
"iso8859_9": [
"cp1252",
"cp1254",
"cp1258",
"iso8859_10",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_4",
"latin_1",
],
"kz1048": ["cp1251", "ptcp154"],
"latin_1": [
"cp1252",
"cp1254",
"cp1258",
"iso8859_10",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_4",
"iso8859_9",
],
"mac_iceland": ["mac_roman", "mac_turkish"],
"mac_roman": ["mac_iceland", "mac_turkish"],
"mac_turkish": ["mac_iceland", "mac_roman"],
"ptcp154": ["cp1251", "kz1048"],
"tis_620": ["iso8859_11"],
}
CHARDET_CORRESPONDENCE: Dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
"tis_620": "TIS-620",
"utf_32": "UTF-32",
"euc_jp": "EUC-JP",
"koi8_r": "KOI8-R",
"iso8859_1": "ISO-8859-1",
"iso8859_2": "ISO-8859-2",
"iso8859_5": "ISO-8859-5",
"iso8859_6": "ISO-8859-6",
"iso8859_7": "ISO-8859-7",
"iso8859_8": "ISO-8859-8",
"utf_16": "UTF-16",
"cp855": "IBM855",
"mac_cyrillic": "MacCyrillic",
"gb2312": "GB2312",
"gb18030": "GB18030",
"cp932": "CP932",
"cp866": "IBM866",
"utf_8": "utf-8",
"utf_8_sig": "UTF-8-SIG",
"shift_jis": "SHIFT_JIS",
"big5": "Big5",
"cp1250": "windows-1250",
"cp1251": "windows-1251",
"cp1252": "Windows-1252",
"cp1253": "windows-1253",
"cp1255": "windows-1255",
"cp1256": "windows-1256",
"cp1254": "Windows-1254",
"cp949": "CP949",
}
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
}
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
# Logging LEVEL below DEBUG
TRACE: int = 5
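if __name__ == "__main__":
    # Editor's illustrative sketch, not part of the upstream file: the combined range
    # table maps a code point to a named Unicode block by plain membership testing.
    assert ord("a") in UNICODE_RANGES_COMBINED["Basic Latin"]
    assert ord("é") in UNICODE_RANGES_COMBINED["Latin-1 Supplement"]
    print("range lookups behave as expected")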


@@ -0,0 +1,95 @@
import warnings
from typing import Dict, Optional, Union
from .api import from_bytes, from_fp, from_path, normalize
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
The encoding name will match Chardet's own spelling whenever possible (except for encodings Chardet does not support).
This function is deprecated; it exists to help you migrate your project easily. Consult the documentation for
further information. It is not planned for removal.
:param byte_str: The byte sequence to examine.
"""
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
return {
"encoding": encoding
if encoding not in CHARDET_CORRESPONDENCE
else CHARDET_CORRESPONDENCE[encoding],
"language": language,
"confidence": confidence,
}
class CharsetNormalizerMatch(CharsetMatch):
pass
class CharsetNormalizerMatches(CharsetMatches):
@staticmethod
def from_fp(*args, **kwargs): # type: ignore
warnings.warn( # pragma: nocover
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
"and scheduled to be removed in 3.0",
DeprecationWarning,
)
return from_fp(*args, **kwargs) # pragma: nocover
@staticmethod
def from_bytes(*args, **kwargs): # type: ignore
warnings.warn( # pragma: nocover
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
"and scheduled to be removed in 3.0",
DeprecationWarning,
)
return from_bytes(*args, **kwargs) # pragma: nocover
@staticmethod
def from_path(*args, **kwargs): # type: ignore
warnings.warn( # pragma: nocover
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
"and scheduled to be removed in 3.0",
DeprecationWarning,
)
return from_path(*args, **kwargs) # pragma: nocover
@staticmethod
def normalize(*args, **kwargs): # type: ignore
warnings.warn( # pragma: nocover
"staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
"and scheduled to be removed in 3.0",
DeprecationWarning,
)
return normalize(*args, **kwargs) # pragma: nocover
class CharsetDetector(CharsetNormalizerMatches):
pass
class CharsetDoctor(CharsetNormalizerMatches):
pass
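if __name__ == "__main__":
    # Editor's illustrative sketch, not part of the upstream file: the chardet-style
    # entry point returns a dict with "encoding", "language" and "confidence" keys.
    print(detect("Bonjour, ceci est un exemple accentué : é è ç.".encode("utf-8")))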


@@ -0,0 +1,553 @@
from functools import lru_cache
from typing import List, Optional
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
is_accentuated,
is_ascii,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon character.
Insert the logic by which the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Reset the plugin to its initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; there is no upper bound.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: Optional[str] = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # pragma: no cover
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # pragma: no cover
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
if ratio_of_suspicious_range_usage < 0.1:
return 0.0
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
# Words/buffers ending with an upper-case accentuated letter are so rare
# that we consider them all suspicious. Same weight as a foreign_long suspicion.
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # pragma: no cover
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB (Chinese) based encodings often render the full stop incorrectly when the content does not fit, and
this can be easily detected. Searching for the overuse of '丅' and '丄'.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character in {"丅", "丄"}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
def reset(self) -> None: # pragma: no cover
self._wrong_stop_count = 0
self._cjk_character_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: Optional[str] = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and is_ascii(character) is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
"""
Determine if two Unicode ranges seen next to each other can be considered suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied with a combining diacritical mark
# eg. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
return True
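def _demo_is_suspiciously_successive_range() -> None:
    """
    Editor's illustrative sketch, not part of the upstream file: adjacent Hiragana and
    Katakana are considered normal, while Basic Latin followed by Cyrillic is flagged
    as a suspicious transition.
    """
    assert is_suspiciously_successive_range("Hiragana", "Katakana") is False
    assert is_suspiciously_successive_range("Basic Latin", "Cyrillic") is True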
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
"""
detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
for dt in detectors: # pragma: nocover
print(dt.__class__, dt.ratio)
return round(mean_mess_ratio, 3)
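if __name__ == "__main__":
    # Editor's illustrative sketch, not part of the upstream file: clean readable text
    # should score close to 0.0, while mojibake-looking text is expected to score higher.
    print(mess_ratio("This is a perfectly ordinary sentence, nothing odd about it."))  # ~0.0
    print(mess_ratio("ÃƒÂ©ÃƒÂ¨ÃƒÂ§ ÃƒÂ¢ suspicious ÃƒÂ®ÃƒÂ¯ gibberish ÃƒÂ¤ÃƒÂ¶ÃƒÂ¼"))  # expected to be higher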


@@ -0,0 +1,401 @@
import warnings
from collections import Counter
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import (
Any,
Counter as TypeCounter,
Dict,
Iterator,
List,
Optional,
Tuple,
Union,
)
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: Optional[List[str]] = None
self._leaves: List[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: Optional[bytes] = None
self._output_encoding: Optional[str] = None
self._string: Optional[str] = decoded_payload
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
raise TypeError(
"__eq__ cannot be invoked on {} and {}.".format(
str(other.__class__), str(self.__class__)
)
)
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
# When having a tough decision, use the result that decoded as many multi-byte as possible.
if chaos_difference == 0.0 and self.coherence == other.coherence:
return self.multi_byte_usage > other.multi_byte_usage
return self.coherence > other.coherence
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - len(str(self)) / len(self.raw)
@property
def chaos_secondary_pass(self) -> float:
"""
Check once again chaos in decoded text, except this time, with full content.
Use with caution, this can be very slow.
Notice: Will be removed in 3.0
"""
warnings.warn(
"chaos_secondary_pass is deprecated and will be removed in 3.0",
DeprecationWarning,
)
return mess_ratio(str(self), 1.0)
@property
def coherence_non_latin(self) -> float:
"""
Coherence ratio on the first non-latin language detected if ANY.
Notice: Will be removed in 3.0
"""
warnings.warn(
"coherence_non_latin is deprecated and will be removed in 3.0",
DeprecationWarning,
)
return 0.0
@property
def w_counter(self) -> TypeCounter[str]:
"""
Word counter instance on decoded text.
Notice: Will be removed in 3.0
"""
warnings.warn(
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
)
string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
return Counter(string_printable_only.split())
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
def add_submatch(self, other: "CharsetMatch") -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> List[str]:
"""
An encoding is known by many names; this can help when searching for IBM855 when it is listed as CP855.
"""
also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> List[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# It's either English, or we should not pronounce ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
# doing it there to avoid circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> List["CharsetMatch"]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> List[str]:
"""
The complete list of encodings that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in property 'encoding'.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def first(self) -> "CharsetMatch":
"""
Kept for BC reasons. Will be removed in 3.0.
"""
return self
def best(self) -> "CharsetMatch":
"""
Kept for BC reasons. Will be removed in 3.0.
"""
return self
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
Any characters that cannot be encoded are handled with the 'replace' error strategy.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container with every CharsetMatch item, ordered by default from most probable to least probable.
Acts like a list (iterable) but does not implement all related methods.
"""
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> Optional["CharsetMatch"]:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> Optional["CharsetMatch"]:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
language: str,
alphabets: List[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
is_preferred: bool,
):
self.path: str = path
self.unicode_path: Optional[str] = unicode_path
self.encoding: Optional[str] = encoding
self.encoding_aliases: List[str] = encoding_aliases
self.alternative_encodings: List[str] = alternative_encodings
self.language: str = language
self.alphabets: List[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)
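if __name__ == "__main__":
    # Editor's illustrative sketch, not part of the upstream file: the CLI result
    # container serialises straight to JSON through its __dict__ property.
    example = CliDetectionResult(
        path="./example.txt",
        encoding="utf_8",
        encoding_aliases=["u8", "utf8"],
        alternative_encodings=[],
        language="English",
        alphabets=["Basic Latin"],
        has_sig_or_bom=False,
        chaos=0.0,
        coherence=97.5,
        unicode_path=None,
        is_preferred=True,
    )
    print(example.to_json())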


@@ -0,0 +1,424 @@
try:
# WARNING: unicodedata2 support is going to be removed in 3.0
# Python is quickly catching up.
import unicodedata2 as unicodedata
except ImportError:
import unicodedata # type: ignore[no-redef]
import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the official Unicode range name for a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
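def _demo_unicode_range() -> None:
    """
    Editor's illustrative sketch, not part of the upstream file: single characters map
    to the block names declared in UNICODE_RANGES_COMBINED.
    """
    assert unicode_range("a") == "Basic Latin"
    assert unicode_range("é") == "Latin-1 Supplement"
    assert unicode_range("я") == "Cyrillic"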
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
except UnicodeEncodeError:
return False
return True
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"｜", "+", ",", ";", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
def is_private_use_only(character: str) -> bool:
character_category: str = unicodedata.category(character)
return character_category == "Co"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "THAI" in character_name
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? It's the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract, using an ASCII-only decoder, any encoding declared in the first n bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
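# Usage sketch (assuming RE_POSSIBLE_ENCODING_INDICATION matches XML/HTML-style
# declarations such as encoding=... or charset=...):
#   any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
# normalises the declared name through encodings.aliases and returns the Python
# codec name (here "latin_1"); it returns None when nothing is declared.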
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify if a specific encoding is a multi-byte one based on its IANA name.
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
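# Usage sketch (assuming ENCODING_MARKS maps "utf_8" to the UTF-8 BOM b"\xef\xbb\xbf"):
#   identify_sig_or_bom(b"\xef\xbb\xbfhello") -> ("utf_8", b"\xef\xbb\xbf")
#   identify_sig_or_bom(b"hello")             -> (None, b"")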
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
return cp_name
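# Usage sketch: different spellings resolve to the same normalized name, e.g.
# iana_name("UTF-8") and iana_name("utf8") both return "utf_8"; an unknown name
# raises ValueError unless strict=False is passed, in which case the normalized
# input is returned instead.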
def range_scan(decoded_sequence: str) -> List[str]:
ranges: Set[str] = set()
for character in decoded_sequence:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
ranges.add(character_range)
return list(ranges)
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
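# Note: the ratio above counts, over the single-byte inputs 0x00-0xFE, how often
# the two decoders yield the same character. Legacy pairs that differ only in the
# 0x80-0x9F block (e.g. cp1252 vs latin_1) therefore land well above the 0.8
# threshold documented for IANA_SUPPORTED_SIMILAR below.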
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code pages are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
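# Usage sketch: set_logging_handler(level=logging.DEBUG) attaches a StreamHandler
# with the given format to the "charset_normalizer" logger; note that each call
# adds another handler, since existing ones are not removed first.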
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk

View File

@ -0,0 +1,6 @@
"""
Expose version
"""
__version__ = "2.1.1"
VERSION = __version__.split(".")

View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
try:
from ._version import version as __version__
except ImportError:
__version__ = 'unknown'
__all__ = ['easter', 'parser', 'relativedelta', 'rrule', 'tz',
'utils', 'zoneinfo']

View File

@ -0,0 +1,43 @@
"""
Common code used in multiple modules.
"""
class weekday(object):
__slots__ = ["weekday", "n"]
def __init__(self, weekday, n=None):
self.weekday = weekday
self.n = n
def __call__(self, n):
if n == self.n:
return self
else:
return self.__class__(self.weekday, n)
def __eq__(self, other):
try:
if self.weekday != other.weekday or self.n != other.n:
return False
except AttributeError:
return False
return True
def __hash__(self):
return hash((
self.weekday,
self.n,
))
def __ne__(self, other):
return not (self == other)
def __repr__(self):
s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday]
if not self.n:
return s
else:
return "%s(%+d)" % (s, self.n)
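# Usage sketch: the singletons built from this class elsewhere in the package
# (MO, TU, ... in relativedelta below) can be called to ask for the n-th
# occurrence of a weekday, e.g. MO(+2) -> weekday(0, 2), repr "MO(+2)".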
# vim:ts=4:sw=4:et

View File

@ -0,0 +1,5 @@
# coding: utf-8
# file generated by setuptools_scm
# don't change, don't track in version control
version = '2.8.2'
version_tuple = (2, 8, 2)

View File

@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""
This module offers a generic Easter computing method for any given year, using
Western, Orthodox or Julian algorithms.
"""
import datetime
__all__ = ["easter", "EASTER_JULIAN", "EASTER_ORTHODOX", "EASTER_WESTERN"]
EASTER_JULIAN = 1
EASTER_ORTHODOX = 2
EASTER_WESTERN = 3
def easter(year, method=EASTER_WESTERN):
"""
This method was ported from the work done by GM Arts,
on top of the algorithm by Claus Tondering, which was
based in part on the algorithm of Oudin (1940), as
quoted in "Explanatory Supplement to the Astronomical
Almanac", P. Kenneth Seidelmann, editor.
This algorithm implements three different Easter
calculation methods:
1. Original calculation in Julian calendar, valid in
dates after 326 AD
2. Original method, with date converted to Gregorian
calendar, valid in years 1583 to 4099
3. Revised method, in Gregorian calendar, valid in
years 1583 to 4099 as well
These methods are represented by the constants:
* ``EASTER_JULIAN = 1``
* ``EASTER_ORTHODOX = 2``
* ``EASTER_WESTERN = 3``
The default method is method 3.
More about the algorithm may be found at:
`GM Arts: Easter Algorithms <http://www.gmarts.org/index.php?go=415>`_
and
`The Calendar FAQ: Easter <https://www.tondering.dk/claus/cal/easter.php>`_
"""
if not (1 <= method <= 3):
raise ValueError("invalid method")
# g - Golden year - 1
# c - Century
# h - (23 - Epact) mod 30
# i - Number of days from March 21 to Paschal Full Moon
# j - Weekday for PFM (0=Sunday, etc)
# p - Number of days from March 21 to Sunday on or before PFM
# (-6 to 28 methods 1 & 3, to 56 for method 2)
# e - Extra days to add for method 2 (converting Julian
# date to Gregorian date)
y = year
g = y % 19
e = 0
if method < 3:
# Old method
i = (19*g + 15) % 30
j = (y + y//4 + i) % 7
if method == 2:
# Extra days to convert Julian to Gregorian date
e = 10
if y > 1600:
e = e + y//100 - 16 - (y//100 - 16)//4
else:
# New method
c = y//100
h = (c - c//4 - (8*c + 13)//25 + 19*g + 15) % 30
i = h - (h//28)*(1 - (h//28)*(29//(h + 1))*((21 - g)//11))
j = (y + y//4 + i + 2 - c + c//4) % 7
# p can be from -6 to 56 corresponding to dates 22 March to 23 May
# (later dates apply to method 2, although 23 May never actually occurs)
p = i - j + e
d = 1 + (p + 27 + (p + 6)//40) % 31
m = 3 + (p + 26)//30
return datetime.date(int(y), int(m), int(d))
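# Usage sketch (dates can be checked against published calendars):
#   easter(2024)                  -> datetime.date(2024, 3, 31)  # Western, default
#   easter(2024, EASTER_ORTHODOX) -> datetime.date(2024, 5, 5)   # Julian moon, Gregorian date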

View File

@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
from ._parser import parse, parser, parserinfo, ParserError
from ._parser import DEFAULTPARSER, DEFAULTTZPARSER
from ._parser import UnknownTimezoneWarning
from ._parser import __doc__
from .isoparser import isoparser, isoparse
__all__ = ['parse', 'parser', 'parserinfo',
'isoparse', 'isoparser',
'ParserError',
'UnknownTimezoneWarning']
###
# Deprecate portions of the private interface so that downstream code that
# is improperly relying on it is given *some* notice.
def __deprecated_private_func(f):
from functools import wraps
import warnings
msg = ('{name} is a private function and may break without warning, '
'it will be moved and or renamed in future versions.')
msg = msg.format(name=f.__name__)
@wraps(f)
def deprecated_func(*args, **kwargs):
warnings.warn(msg, DeprecationWarning)
return f(*args, **kwargs)
return deprecated_func
def __deprecate_private_class(c):
import warnings
msg = ('{name} is a private class and may break without warning, '
'it will be moved and or renamed in future versions.')
msg = msg.format(name=c.__name__)
class private_class(c):
__doc__ = c.__doc__
def __init__(self, *args, **kwargs):
warnings.warn(msg, DeprecationWarning)
super(private_class, self).__init__(*args, **kwargs)
private_class.__name__ = c.__name__
return private_class
from ._parser import _timelex, _resultbase
from ._parser import _tzparser, _parsetz
_timelex = __deprecate_private_class(_timelex)
_tzparser = __deprecate_private_class(_tzparser)
_resultbase = __deprecate_private_class(_resultbase)
_parsetz = __deprecated_private_func(_parsetz)

File diff suppressed because it is too large

View File

@ -0,0 +1,416 @@
# -*- coding: utf-8 -*-
"""
This module offers a parser for ISO-8601 strings
It is intended to support all valid date, time and datetime formats per the
ISO-8601 specification.
.. versionadded:: 2.7.0
"""
from datetime import datetime, timedelta, time, date
import calendar
from dateutil import tz
from functools import wraps
import re
import six
__all__ = ["isoparse", "isoparser"]
def _takes_ascii(f):
@wraps(f)
def func(self, str_in, *args, **kwargs):
# If it's a stream, read the whole thing
str_in = getattr(str_in, 'read', lambda: str_in)()
# If it's unicode, turn it into bytes, since ISO-8601 only covers ASCII
if isinstance(str_in, six.text_type):
# ASCII is the same in UTF-8
try:
str_in = str_in.encode('ascii')
except UnicodeEncodeError as e:
msg = 'ISO-8601 strings should contain only ASCII characters'
six.raise_from(ValueError(msg), e)
return f(self, str_in, *args, **kwargs)
return func
class isoparser(object):
def __init__(self, sep=None):
"""
:param sep:
A single character that separates date and time portions. If
``None``, the parser will accept any single character.
For strict ISO-8601 adherence, pass ``'T'``.
"""
if sep is not None:
if (len(sep) != 1 or ord(sep) >= 128 or sep in '0123456789'):
raise ValueError('Separator must be a single, non-numeric ' +
'ASCII character')
sep = sep.encode('ascii')
self._sep = sep
@_takes_ascii
def isoparse(self, dt_str):
"""
Parse an ISO-8601 datetime string into a :class:`datetime.datetime`.
An ISO-8601 datetime string consists of a date portion, followed
optionally by a time portion - the date and time portions are separated
by a single character separator, which is ``T`` in the official
standard. Incomplete date formats (such as ``YYYY-MM``) may *not* be
combined with a time portion.
Supported date formats are:
Common:
- ``YYYY``
- ``YYYY-MM`` or ``YYYYMM``
- ``YYYY-MM-DD`` or ``YYYYMMDD``
Uncommon:
- ``YYYY-Www`` or ``YYYYWww`` - ISO week (day defaults to 0)
- ``YYYY-Www-D`` or ``YYYYWwwD`` - ISO week and day
The ISO week and day numbering follows the same logic as
:func:`datetime.date.isocalendar`.
Supported time formats are:
- ``hh``
- ``hh:mm`` or ``hhmm``
- ``hh:mm:ss`` or ``hhmmss``
- ``hh:mm:ss.ssssss`` (Up to 6 sub-second digits)
Midnight is a special case for `hh`, as the standard supports both
00:00 and 24:00 as a representation. The decimal separator can be
either a dot or a comma.
.. caution::
Support for fractional components other than seconds is part of the
ISO-8601 standard, but is not currently implemented in this parser.
Supported time zone offset formats are:
- `Z` (UTC)
- `±HH:MM`
- `±HHMM`
- `±HH`
Offsets will be represented as :class:`dateutil.tz.tzoffset` objects,
with the exception of UTC, which will be represented as
:class:`dateutil.tz.tzutc`. Time zone offsets equivalent to UTC (such
as `+00:00`) will also be represented as :class:`dateutil.tz.tzutc`.
:param dt_str:
A string or stream containing only an ISO-8601 datetime string
:return:
Returns a :class:`datetime.datetime` representing the string.
Unspecified components default to their lowest value.
.. warning::
As of version 2.7.0, the strictness of the parser should not be
considered a stable part of the contract. Any valid ISO-8601 string
that parses correctly with the default settings will continue to
parse correctly in future versions, but invalid strings that
currently fail (e.g. ``2017-01-01T00:00+00:00:00``) are not
guaranteed to continue failing in future versions if they encode
a valid date.
.. versionadded:: 2.7.0
"""
components, pos = self._parse_isodate(dt_str)
if len(dt_str) > pos:
if self._sep is None or dt_str[pos:pos + 1] == self._sep:
components += self._parse_isotime(dt_str[pos + 1:])
else:
raise ValueError('String contains unknown ISO components')
if len(components) > 3 and components[3] == 24:
components[3] = 0
return datetime(*components) + timedelta(days=1)
return datetime(*components)
@_takes_ascii
def parse_isodate(self, datestr):
"""
Parse the date portion of an ISO string.
:param datestr:
The string portion of an ISO string, without a separator
:return:
Returns a :class:`datetime.date` object
"""
components, pos = self._parse_isodate(datestr)
if pos < len(datestr):
raise ValueError('String contains unknown ISO ' +
'components: {!r}'.format(datestr.decode('ascii')))
return date(*components)
@_takes_ascii
def parse_isotime(self, timestr):
"""
Parse the time portion of an ISO string.
:param timestr:
The time portion of an ISO string, without a separator
:return:
Returns a :class:`datetime.time` object
"""
components = self._parse_isotime(timestr)
if components[0] == 24:
components[0] = 0
return time(*components)
@_takes_ascii
def parse_tzstr(self, tzstr, zero_as_utc=True):
"""
Parse a valid ISO time zone string.
See :func:`isoparser.isoparse` for details on supported formats.
:param tzstr:
A string representing an ISO time zone offset
:param zero_as_utc:
Whether to return :class:`dateutil.tz.tzutc` for zero-offset zones
:return:
Returns :class:`dateutil.tz.tzoffset` for offsets and
:class:`dateutil.tz.tzutc` for ``Z`` and (if ``zero_as_utc`` is
specified) offsets equivalent to UTC.
"""
return self._parse_tzstr(tzstr, zero_as_utc=zero_as_utc)
# Constants
_DATE_SEP = b'-'
_TIME_SEP = b':'
_FRACTION_REGEX = re.compile(b'[\\.,]([0-9]+)')
def _parse_isodate(self, dt_str):
try:
return self._parse_isodate_common(dt_str)
except ValueError:
return self._parse_isodate_uncommon(dt_str)
def _parse_isodate_common(self, dt_str):
len_str = len(dt_str)
components = [1, 1, 1]
if len_str < 4:
raise ValueError('ISO string too short')
# Year
components[0] = int(dt_str[0:4])
pos = 4
if pos >= len_str:
return components, pos
has_sep = dt_str[pos:pos + 1] == self._DATE_SEP
if has_sep:
pos += 1
# Month
if len_str - pos < 2:
raise ValueError('Invalid common month')
components[1] = int(dt_str[pos:pos + 2])
pos += 2
if pos >= len_str:
if has_sep:
return components, pos
else:
raise ValueError('Invalid ISO format')
if has_sep:
if dt_str[pos:pos + 1] != self._DATE_SEP:
raise ValueError('Invalid separator in ISO string')
pos += 1
# Day
if len_str - pos < 2:
raise ValueError('Invalid common day')
components[2] = int(dt_str[pos:pos + 2])
return components, pos + 2
def _parse_isodate_uncommon(self, dt_str):
if len(dt_str) < 4:
raise ValueError('ISO string too short')
# All ISO formats start with the year
year = int(dt_str[0:4])
has_sep = dt_str[4:5] == self._DATE_SEP
pos = 4 + has_sep # Skip '-' if it's there
if dt_str[pos:pos + 1] == b'W':
# YYYY-?Www-?D?
pos += 1
weekno = int(dt_str[pos:pos + 2])
pos += 2
dayno = 1
if len(dt_str) > pos:
if (dt_str[pos:pos + 1] == self._DATE_SEP) != has_sep:
raise ValueError('Inconsistent use of dash separator')
pos += has_sep
dayno = int(dt_str[pos:pos + 1])
pos += 1
base_date = self._calculate_weekdate(year, weekno, dayno)
else:
# YYYYDDD or YYYY-DDD
if len(dt_str) - pos < 3:
raise ValueError('Invalid ordinal day')
ordinal_day = int(dt_str[pos:pos + 3])
pos += 3
if ordinal_day < 1 or ordinal_day > (365 + calendar.isleap(year)):
raise ValueError('Invalid ordinal day' +
' {} for year {}'.format(ordinal_day, year))
base_date = date(year, 1, 1) + timedelta(days=ordinal_day - 1)
components = [base_date.year, base_date.month, base_date.day]
return components, pos
def _calculate_weekdate(self, year, week, day):
"""
Calculate the day corresponding to the ISO year-week-day calendar.
This function is effectively the inverse of
:func:`datetime.date.isocalendar`.
:param year:
The year in the ISO calendar
:param week:
The week in the ISO calendar - range is [1, 53]
:param day:
The day in the ISO calendar - range is [1 (MON), 7 (SUN)]
:return:
Returns a :class:`datetime.date`
"""
if not 0 < week < 54:
raise ValueError('Invalid week: {}'.format(week))
if not 0 < day < 8: # Range is 1-7
raise ValueError('Invalid weekday: {}'.format(day))
# Get week 1 for the specific year:
jan_4 = date(year, 1, 4) # Week 1 always has January 4th in it
week_1 = jan_4 - timedelta(days=jan_4.isocalendar()[2] - 1)
# Now add the specific number of weeks and days to get what we want
week_offset = (week - 1) * 7 + (day - 1)
return week_1 + timedelta(days=week_offset)
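# Illustration: this inverts datetime.date.isocalendar(); for example
# _calculate_weekdate(2004, 53, 6) gives date(2005, 1, 1), whose isocalendar()
# is (2004, 53, 6).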
def _parse_isotime(self, timestr):
len_str = len(timestr)
components = [0, 0, 0, 0, None]
pos = 0
comp = -1
if len_str < 2:
raise ValueError('ISO time too short')
has_sep = False
while pos < len_str and comp < 5:
comp += 1
if timestr[pos:pos + 1] in b'-+Zz':
# Detect time zone boundary
components[-1] = self._parse_tzstr(timestr[pos:])
pos = len_str
break
if comp == 1 and timestr[pos:pos+1] == self._TIME_SEP:
has_sep = True
pos += 1
elif comp == 2 and has_sep:
if timestr[pos:pos+1] != self._TIME_SEP:
raise ValueError('Inconsistent use of colon separator')
pos += 1
if comp < 3:
# Hour, minute, second
components[comp] = int(timestr[pos:pos + 2])
pos += 2
if comp == 3:
# Fraction of a second
frac = self._FRACTION_REGEX.match(timestr[pos:])
if not frac:
continue
us_str = frac.group(1)[:6] # Truncate to microseconds
components[comp] = int(us_str) * 10**(6 - len(us_str))
pos += len(frac.group())
if pos < len_str:
raise ValueError('Unused components in ISO string')
if components[0] == 24:
# Standard supports 00:00 and 24:00 as representations of midnight
if any(component != 0 for component in components[1:4]):
raise ValueError('Hour may only be 24 at 24:00:00.000')
return components
def _parse_tzstr(self, tzstr, zero_as_utc=True):
if tzstr == b'Z' or tzstr == b'z':
return tz.UTC
if len(tzstr) not in {3, 5, 6}:
raise ValueError('Time zone offset must be 1, 3, 5 or 6 characters')
if tzstr[0:1] == b'-':
mult = -1
elif tzstr[0:1] == b'+':
mult = 1
else:
raise ValueError('Time zone offset requires sign')
hours = int(tzstr[1:3])
if len(tzstr) == 3:
minutes = 0
else:
minutes = int(tzstr[(4 if tzstr[3:4] == self._TIME_SEP else 3):])
if zero_as_utc and hours == 0 and minutes == 0:
return tz.UTC
else:
if minutes > 59:
raise ValueError('Invalid minutes in time zone offset')
if hours > 23:
raise ValueError('Invalid hours in time zone offset')
return tz.tzoffset(None, mult * (hours * 60 + minutes) * 60)
DEFAULT_ISOPARSER = isoparser()
isoparse = DEFAULT_ISOPARSER.isoparse
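# Usage sketch:
#   isoparse("2018-04-09T13:37:00")       -> datetime(2018, 4, 9, 13, 37)
#   isoparse("2018-04-09T13:37:00+02:00") -> same wall time, tzinfo=tzoffset(None, 7200)
#   isoparse("2018-W15-1")                -> datetime(2018, 4, 9, 0, 0)  # ISO week date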

View File

@ -0,0 +1,599 @@
# -*- coding: utf-8 -*-
import datetime
import calendar
import operator
from math import copysign
from six import integer_types
from warnings import warn
from ._common import weekday
MO, TU, WE, TH, FR, SA, SU = weekdays = tuple(weekday(x) for x in range(7))
__all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"]
class relativedelta(object):
"""
The relativedelta type is designed to be applied to an existing datetime and
can replace specific components of that datetime, or represents an interval
of time.
It is based on the specification of the excellent work done by M.-A. Lemburg
in his
`mx.DateTime <https://www.egenix.com/products/python/mxBase/mxDateTime/>`_ extension.
However, notice that this type does *NOT* implement the same algorithm as
his work. Do *NOT* expect it to behave like mx.DateTime's counterpart.
There are two different ways to build a relativedelta instance. The
first one is passing it two date/datetime classes::
relativedelta(datetime1, datetime2)
The second one is passing it any number of the following keyword arguments::
relativedelta(arg1=x,arg2=y,arg3=z...)
year, month, day, hour, minute, second, microsecond:
Absolute information (argument is singular); adding or subtracting a
relativedelta with absolute information does not perform an arithmetic
operation, but rather REPLACES the corresponding value in the
original datetime with the value(s) in relativedelta.
years, months, weeks, days, hours, minutes, seconds, microseconds:
Relative information, may be negative (argument is plural); adding
or subtracting a relativedelta with relative information performs
the corresponding arithmetic operation on the original datetime value
with the information in the relativedelta.
weekday:
One of the weekday instances (MO, TU, etc) available in the
relativedelta module. These instances may receive a parameter N,
specifying the Nth weekday, which could be positive or negative
(like MO(+1) or MO(-2)). Not specifying it is the same as specifying
+1. You can also use an integer, where 0=MO. This argument is always
relative e.g. if the calculated date is already Monday, using MO(1)
or MO(-1) won't change the day. To effectively make it absolute, use
it in combination with the day argument (e.g. day=1, MO(1) for first
Monday of the month).
leapdays:
Will add the given days to the date found, if the year is a leap
year and the date found is after 28 February.
yearday, nlyearday:
Set the yearday or the non-leap year day (jump leap days).
These are converted to day/month/leapdays information.
There are relative and absolute forms of the keyword
arguments. The plural is relative, and the singular is
absolute. For each argument in the order below, the absolute form
is applied first (by setting each attribute to that value) and
then the relative form (by adding the value to the attribute).
The order of attributes considered when this relativedelta is
added to a datetime is:
1. Year
2. Month
3. Day
4. Hours
5. Minutes
6. Seconds
7. Microseconds
Finally, weekday is applied, using the rule described above.
For example
>>> from datetime import datetime
>>> from dateutil.relativedelta import relativedelta, MO
>>> dt = datetime(2018, 4, 9, 13, 37, 0)
>>> delta = relativedelta(hours=25, day=1, weekday=MO(1))
>>> dt + delta
datetime.datetime(2018, 4, 2, 14, 37)
First, the day is set to 1 (the first of the month), then 25 hours
are added, to get to the 2nd day and 14th hour, finally the
weekday is applied, but since the 2nd is already a Monday there is
no effect.
"""
def __init__(self, dt1=None, dt2=None,
years=0, months=0, days=0, leapdays=0, weeks=0,
hours=0, minutes=0, seconds=0, microseconds=0,
year=None, month=None, day=None, weekday=None,
yearday=None, nlyearday=None,
hour=None, minute=None, second=None, microsecond=None):
if dt1 and dt2:
# datetime is a subclass of date. So both must be date
if not (isinstance(dt1, datetime.date) and
isinstance(dt2, datetime.date)):
raise TypeError("relativedelta only diffs datetime/date")
# We allow two dates, or two datetimes, so we coerce them to be
# of the same type
if (isinstance(dt1, datetime.datetime) !=
isinstance(dt2, datetime.datetime)):
if not isinstance(dt1, datetime.datetime):
dt1 = datetime.datetime.fromordinal(dt1.toordinal())
elif not isinstance(dt2, datetime.datetime):
dt2 = datetime.datetime.fromordinal(dt2.toordinal())
self.years = 0
self.months = 0
self.days = 0
self.leapdays = 0
self.hours = 0
self.minutes = 0
self.seconds = 0
self.microseconds = 0
self.year = None
self.month = None
self.day = None
self.weekday = None
self.hour = None
self.minute = None
self.second = None
self.microsecond = None
self._has_time = 0
# Get year / month delta between the two
months = (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month)
self._set_months(months)
# Remove the year/month delta so the timedelta is just well-defined
# time units (seconds, days and microseconds)
dtm = self.__radd__(dt2)
# If we've overshot our target, make an adjustment
if dt1 < dt2:
compare = operator.gt
increment = 1
else:
compare = operator.lt
increment = -1
while compare(dt1, dtm):
months += increment
self._set_months(months)
dtm = self.__radd__(dt2)
# Get the timedelta between the "months-adjusted" date and dt1
delta = dt1 - dtm
self.seconds = delta.seconds + delta.days * 86400
self.microseconds = delta.microseconds
else:
# Check for non-integer values in integer-only quantities
if any(x is not None and x != int(x) for x in (years, months)):
raise ValueError("Non-integer years and months are "
"ambiguous and not currently supported.")
# Relative information
self.years = int(years)
self.months = int(months)
self.days = days + weeks * 7
self.leapdays = leapdays
self.hours = hours
self.minutes = minutes
self.seconds = seconds
self.microseconds = microseconds
# Absolute information
self.year = year
self.month = month
self.day = day
self.hour = hour
self.minute = minute
self.second = second
self.microsecond = microsecond
if any(x is not None and int(x) != x
for x in (year, month, day, hour,
minute, second, microsecond)):
# For now we'll deprecate floats - later it'll be an error.
warn("Non-integer value passed as absolute information. " +
"This is not a well-defined condition and will raise " +
"errors in future versions.", DeprecationWarning)
if isinstance(weekday, integer_types):
self.weekday = weekdays[weekday]
else:
self.weekday = weekday
yday = 0
if nlyearday:
yday = nlyearday
elif yearday:
yday = yearday
if yearday > 59:
self.leapdays = -1
if yday:
ydayidx = [31, 59, 90, 120, 151, 181, 212,
243, 273, 304, 334, 366]
for idx, ydays in enumerate(ydayidx):
if yday <= ydays:
self.month = idx+1
if idx == 0:
self.day = yday
else:
self.day = yday-ydayidx[idx-1]
break
else:
raise ValueError("invalid year day (%d)" % yday)
self._fix()
def _fix(self):
if abs(self.microseconds) > 999999:
s = _sign(self.microseconds)
div, mod = divmod(self.microseconds * s, 1000000)
self.microseconds = mod * s
self.seconds += div * s
if abs(self.seconds) > 59:
s = _sign(self.seconds)
div, mod = divmod(self.seconds * s, 60)
self.seconds = mod * s
self.minutes += div * s
if abs(self.minutes) > 59:
s = _sign(self.minutes)
div, mod = divmod(self.minutes * s, 60)
self.minutes = mod * s
self.hours += div * s
if abs(self.hours) > 23:
s = _sign(self.hours)
div, mod = divmod(self.hours * s, 24)
self.hours = mod * s
self.days += div * s
if abs(self.months) > 11:
s = _sign(self.months)
div, mod = divmod(self.months * s, 12)
self.months = mod * s
self.years += div * s
if (self.hours or self.minutes or self.seconds or self.microseconds
or self.hour is not None or self.minute is not None or
self.second is not None or self.microsecond is not None):
self._has_time = 1
else:
self._has_time = 0
@property
def weeks(self):
return int(self.days / 7.0)
@weeks.setter
def weeks(self, value):
self.days = self.days - (self.weeks * 7) + value * 7
def _set_months(self, months):
self.months = months
if abs(self.months) > 11:
s = _sign(self.months)
div, mod = divmod(self.months * s, 12)
self.months = mod * s
self.years = div * s
else:
self.years = 0
def normalized(self):
"""
Return a version of this object represented entirely using integer
values for the relative attributes.
>>> relativedelta(days=1.5, hours=2).normalized()
relativedelta(days=+1, hours=+14)
:return:
Returns a :class:`dateutil.relativedelta.relativedelta` object.
"""
# Cascade remainders down (rounding each to roughly nearest microsecond)
days = int(self.days)
hours_f = round(self.hours + 24 * (self.days - days), 11)
hours = int(hours_f)
minutes_f = round(self.minutes + 60 * (hours_f - hours), 10)
minutes = int(minutes_f)
seconds_f = round(self.seconds + 60 * (minutes_f - minutes), 8)
seconds = int(seconds_f)
microseconds = round(self.microseconds + 1e6 * (seconds_f - seconds))
# Constructor carries overflow back up with call to _fix()
return self.__class__(years=self.years, months=self.months,
days=days, hours=hours, minutes=minutes,
seconds=seconds, microseconds=microseconds,
leapdays=self.leapdays, year=self.year,
month=self.month, day=self.day,
weekday=self.weekday, hour=self.hour,
minute=self.minute, second=self.second,
microsecond=self.microsecond)
def __add__(self, other):
if isinstance(other, relativedelta):
return self.__class__(years=other.years + self.years,
months=other.months + self.months,
days=other.days + self.days,
hours=other.hours + self.hours,
minutes=other.minutes + self.minutes,
seconds=other.seconds + self.seconds,
microseconds=(other.microseconds +
self.microseconds),
leapdays=other.leapdays or self.leapdays,
year=(other.year if other.year is not None
else self.year),
month=(other.month if other.month is not None
else self.month),
day=(other.day if other.day is not None
else self.day),
weekday=(other.weekday if other.weekday is not None
else self.weekday),
hour=(other.hour if other.hour is not None
else self.hour),
minute=(other.minute if other.minute is not None
else self.minute),
second=(other.second if other.second is not None
else self.second),
microsecond=(other.microsecond if other.microsecond
is not None else
self.microsecond))
if isinstance(other, datetime.timedelta):
return self.__class__(years=self.years,
months=self.months,
days=self.days + other.days,
hours=self.hours,
minutes=self.minutes,
seconds=self.seconds + other.seconds,
microseconds=self.microseconds + other.microseconds,
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
if not isinstance(other, datetime.date):
return NotImplemented
elif self._has_time and not isinstance(other, datetime.datetime):
other = datetime.datetime.fromordinal(other.toordinal())
year = (self.year or other.year)+self.years
month = self.month or other.month
if self.months:
assert 1 <= abs(self.months) <= 12
month += self.months
if month > 12:
year += 1
month -= 12
elif month < 1:
year -= 1
month += 12
day = min(calendar.monthrange(year, month)[1],
self.day or other.day)
repl = {"year": year, "month": month, "day": day}
for attr in ["hour", "minute", "second", "microsecond"]:
value = getattr(self, attr)
if value is not None:
repl[attr] = value
days = self.days
if self.leapdays and month > 2 and calendar.isleap(year):
days += self.leapdays
ret = (other.replace(**repl)
+ datetime.timedelta(days=days,
hours=self.hours,
minutes=self.minutes,
seconds=self.seconds,
microseconds=self.microseconds))
if self.weekday:
weekday, nth = self.weekday.weekday, self.weekday.n or 1
jumpdays = (abs(nth) - 1) * 7
if nth > 0:
jumpdays += (7 - ret.weekday() + weekday) % 7
else:
jumpdays += (ret.weekday() - weekday) % 7
jumpdays *= -1
ret += datetime.timedelta(days=jumpdays)
return ret
def __radd__(self, other):
return self.__add__(other)
def __rsub__(self, other):
return self.__neg__().__radd__(other)
def __sub__(self, other):
if not isinstance(other, relativedelta):
return NotImplemented # In case the other object defines __rsub__
return self.__class__(years=self.years - other.years,
months=self.months - other.months,
days=self.days - other.days,
hours=self.hours - other.hours,
minutes=self.minutes - other.minutes,
seconds=self.seconds - other.seconds,
microseconds=self.microseconds - other.microseconds,
leapdays=self.leapdays or other.leapdays,
year=(self.year if self.year is not None
else other.year),
month=(self.month if self.month is not None else
other.month),
day=(self.day if self.day is not None else
other.day),
weekday=(self.weekday if self.weekday is not None else
other.weekday),
hour=(self.hour if self.hour is not None else
other.hour),
minute=(self.minute if self.minute is not None else
other.minute),
second=(self.second if self.second is not None else
other.second),
microsecond=(self.microsecond if self.microsecond
is not None else
other.microsecond))
def __abs__(self):
return self.__class__(years=abs(self.years),
months=abs(self.months),
days=abs(self.days),
hours=abs(self.hours),
minutes=abs(self.minutes),
seconds=abs(self.seconds),
microseconds=abs(self.microseconds),
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
def __neg__(self):
return self.__class__(years=-self.years,
months=-self.months,
days=-self.days,
hours=-self.hours,
minutes=-self.minutes,
seconds=-self.seconds,
microseconds=-self.microseconds,
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
def __bool__(self):
return not (not self.years and
not self.months and
not self.days and
not self.hours and
not self.minutes and
not self.seconds and
not self.microseconds and
not self.leapdays and
self.year is None and
self.month is None and
self.day is None and
self.weekday is None and
self.hour is None and
self.minute is None and
self.second is None and
self.microsecond is None)
# Compatibility with Python 2.x
__nonzero__ = __bool__
def __mul__(self, other):
try:
f = float(other)
except TypeError:
return NotImplemented
return self.__class__(years=int(self.years * f),
months=int(self.months * f),
days=int(self.days * f),
hours=int(self.hours * f),
minutes=int(self.minutes * f),
seconds=int(self.seconds * f),
microseconds=int(self.microseconds * f),
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
__rmul__ = __mul__
def __eq__(self, other):
if not isinstance(other, relativedelta):
return NotImplemented
if self.weekday or other.weekday:
if not self.weekday or not other.weekday:
return False
if self.weekday.weekday != other.weekday.weekday:
return False
n1, n2 = self.weekday.n, other.weekday.n
if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)):
return False
return (self.years == other.years and
self.months == other.months and
self.days == other.days and
self.hours == other.hours and
self.minutes == other.minutes and
self.seconds == other.seconds and
self.microseconds == other.microseconds and
self.leapdays == other.leapdays and
self.year == other.year and
self.month == other.month and
self.day == other.day and
self.hour == other.hour and
self.minute == other.minute and
self.second == other.second and
self.microsecond == other.microsecond)
def __hash__(self):
return hash((
self.weekday,
self.years,
self.months,
self.days,
self.hours,
self.minutes,
self.seconds,
self.microseconds,
self.leapdays,
self.year,
self.month,
self.day,
self.hour,
self.minute,
self.second,
self.microsecond,
))
def __ne__(self, other):
return not self.__eq__(other)
def __div__(self, other):
try:
reciprocal = 1 / float(other)
except TypeError:
return NotImplemented
return self.__mul__(reciprocal)
__truediv__ = __div__
def __repr__(self):
l = []
for attr in ["years", "months", "days", "leapdays",
"hours", "minutes", "seconds", "microseconds"]:
value = getattr(self, attr)
if value:
l.append("{attr}={value:+g}".format(attr=attr, value=value))
for attr in ["year", "month", "day", "weekday",
"hour", "minute", "second", "microsecond"]:
value = getattr(self, attr)
if value is not None:
l.append("{attr}={value}".format(attr=attr, value=repr(value)))
return "{classname}({attrs})".format(classname=self.__class__.__name__,
attrs=", ".join(l))
def _sign(x):
return int(copysign(1, x))
# vim:ts=4:sw=4:et

File diff suppressed because it is too large

View File

@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
from .tz import *
from .tz import __doc__
__all__ = ["tzutc", "tzoffset", "tzlocal", "tzfile", "tzrange",
"tzstr", "tzical", "tzwin", "tzwinlocal", "gettz",
"enfold", "datetime_ambiguous", "datetime_exists",
"resolve_imaginary", "UTC", "DeprecatedTzFormatWarning"]
class DeprecatedTzFormatWarning(Warning):
"""Warning raised when time zones are parsed from deprecated formats."""

View File

@ -0,0 +1,419 @@
from six import PY2
from functools import wraps
from datetime import datetime, timedelta, tzinfo
ZERO = timedelta(0)
__all__ = ['tzname_in_python2', 'enfold']
def tzname_in_python2(namefunc):
"""Change unicode output into bytestrings in Python 2
tzname() API changed in Python 3. It used to return bytes, but was changed
to unicode strings
"""
if PY2:
@wraps(namefunc)
def adjust_encoding(*args, **kwargs):
name = namefunc(*args, **kwargs)
if name is not None:
name = name.encode()
return name
return adjust_encoding
else:
return namefunc
# The following is adapted from Alexander Belopolsky's tz library
# https://github.com/abalkin/tz
if hasattr(datetime, 'fold'):
# This branch is taken on Python 3.6+, where datetime has a native fold attribute
def enfold(dt, fold=1):
"""
Provides a unified interface for assigning the ``fold`` attribute to
datetimes both before and after the implementation of PEP-495.
:param fold:
The value for the ``fold`` attribute in the returned datetime. This
should be either 0 or 1.
:return:
Returns an object for which ``getattr(dt, 'fold', 0)`` returns
``fold`` for all versions of Python. In versions prior to
Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
subclass of :py:class:`datetime.datetime` with the ``fold``
attribute added, if ``fold`` is 1.
.. versionadded:: 2.6.0
"""
return dt.replace(fold=fold)
else:
class _DatetimeWithFold(datetime):
"""
This is a class designed to provide a PEP 495-compliant interface for
Python versions before 3.6. It is used only for dates in a fold, so
the ``fold`` attribute is fixed at ``1``.
.. versionadded:: 2.6.0
"""
__slots__ = ()
def replace(self, *args, **kwargs):
"""
Return a datetime with the same attributes, except for those
attributes given new values by whichever keyword arguments are
specified. Note that tzinfo=None can be specified to create a naive
datetime from an aware datetime with no conversion of date and time
data.
This is reimplemented in ``_DatetimeWithFold`` because pypy3 will
return a ``datetime.datetime`` even if ``fold`` is unchanged.
"""
argnames = (
'year', 'month', 'day', 'hour', 'minute', 'second',
'microsecond', 'tzinfo'
)
for arg, argname in zip(args, argnames):
if argname in kwargs:
raise TypeError('Duplicate argument: {}'.format(argname))
kwargs[argname] = arg
for argname in argnames:
if argname not in kwargs:
kwargs[argname] = getattr(self, argname)
dt_class = self.__class__ if kwargs.get('fold', 1) else datetime
return dt_class(**kwargs)
@property
def fold(self):
return 1
def enfold(dt, fold=1):
"""
Provides a unified interface for assigning the ``fold`` attribute to
datetimes both before and after the implementation of PEP-495.
:param fold:
The value for the ``fold`` attribute in the returned datetime. This
should be either 0 or 1.
:return:
Returns an object for which ``getattr(dt, 'fold', 0)`` returns
``fold`` for all versions of Python. In versions prior to
Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
subclass of :py:class:`datetime.datetime` with the ``fold``
attribute added, if ``fold`` is 1.
.. versionadded:: 2.6.0
"""
if getattr(dt, 'fold', 0) == fold:
return dt
args = dt.timetuple()[:6]
args += (dt.microsecond, dt.tzinfo)
if fold:
return _DatetimeWithFold(*args)
else:
return datetime(*args)
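# Usage sketch: whichever branch above is active, enfold() yields a datetime whose
# fold status can be read with getattr(dt, 'fold', 0):
#   dt = datetime(2020, 11, 1, 1, 30)
#   getattr(enfold(dt), 'fold', 0)          -> 1
#   getattr(enfold(dt, fold=0), 'fold', 0)  -> 0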
def _validate_fromutc_inputs(f):
"""
The CPython version of ``fromutc`` checks that the input is a ``datetime``
object and that ``self`` is attached as its ``tzinfo``.
"""
@wraps(f)
def fromutc(self, dt):
if not isinstance(dt, datetime):
raise TypeError("fromutc() requires a datetime argument")
if dt.tzinfo is not self:
raise ValueError("dt.tzinfo is not self")
return f(self, dt)
return fromutc
class _tzinfo(tzinfo):
"""
Base class for all ``dateutil`` ``tzinfo`` objects.
"""
def is_ambiguous(self, dt):
"""
Whether or not the "wall time" of a given datetime is ambiguous in this
zone.
:param dt:
A :py:class:`datetime.datetime`, naive or time zone aware.
:return:
Returns ``True`` if ambiguous, ``False`` otherwise.
.. versionadded:: 2.6.0
"""
dt = dt.replace(tzinfo=self)
wall_0 = enfold(dt, fold=0)
wall_1 = enfold(dt, fold=1)
same_offset = wall_0.utcoffset() == wall_1.utcoffset()
same_dt = wall_0.replace(tzinfo=None) == wall_1.replace(tzinfo=None)
return same_dt and not same_offset
def _fold_status(self, dt_utc, dt_wall):
"""
Determine the fold status of a "wall" datetime, given a representation
of the same datetime as a (naive) UTC datetime. This is calculated based
on the assumption that ``dt.utcoffset() - dt.dst()`` is constant for all
datetimes, and that this offset is the actual number of hours separating
``dt_utc`` and ``dt_wall``.
:param dt_utc:
Representation of the datetime as UTC
:param dt_wall:
Representation of the datetime as "wall time". This parameter must
either have a `fold` attribute or have a fold-naive
:class:`datetime.tzinfo` attached, otherwise the calculation may
fail.
"""
if self.is_ambiguous(dt_wall):
delta_wall = dt_wall - dt_utc
_fold = int(delta_wall == (dt_utc.utcoffset() - dt_utc.dst()))
else:
_fold = 0
return _fold
def _fold(self, dt):
return getattr(dt, 'fold', 0)
def _fromutc(self, dt):
"""
Given a timezone-aware datetime in a given timezone, calculates a
timezone-aware datetime in a new timezone.
Since this is the one time that we *know* we have an unambiguous
datetime object, we take this opportunity to determine whether the
datetime is ambiguous and in a "fold" state (e.g. if it's the first
occurrence, chronologically, of the ambiguous datetime).
:param dt:
A timezone-aware :class:`datetime.datetime` object.
"""
# Re-implement the algorithm from Python's datetime.py
dtoff = dt.utcoffset()
if dtoff is None:
raise ValueError("fromutc() requires a non-None utcoffset() "
"result")
# The original datetime.py code assumes that `dst()` defaults to
# zero during ambiguous times. PEP 495 inverts this presumption, so
# for pre-PEP 495 versions of python, we need to tweak the algorithm.
dtdst = dt.dst()
if dtdst is None:
raise ValueError("fromutc() requires a non-None dst() result")
delta = dtoff - dtdst
dt += delta
# Set fold=1 so we can default to being in the fold for
# ambiguous dates.
dtdst = enfold(dt, fold=1).dst()
if dtdst is None:
raise ValueError("fromutc(): dt.dst gave inconsistent "
"results; cannot convert")
return dt + dtdst
@_validate_fromutc_inputs
def fromutc(self, dt):
"""
Given a timezone-aware datetime in a given timezone, calculates a
timezone-aware datetime in a new timezone.
Since this is the one time that we *know* we have an unambiguous
datetime object, we take this opportunity to determine whether the
datetime is ambiguous and in a "fold" state (e.g. if it's the first
occurrence, chronologically, of the ambiguous datetime).
:param dt:
A timezone-aware :class:`datetime.datetime` object.
"""
dt_wall = self._fromutc(dt)
# Calculate the fold status given the two datetimes.
_fold = self._fold_status(dt, dt_wall)
# Set the default fold value for ambiguous dates
return enfold(dt_wall, fold=_fold)
class tzrangebase(_tzinfo):
"""
This is an abstract base class for time zones represented by an annual
transition into and out of DST. Child classes should implement the following
methods:
* ``__init__(self, *args, **kwargs)``
* ``transitions(self, year)`` - this is expected to return a tuple of
datetimes representing the DST on and off transitions in standard
time.
A fully initialized ``tzrangebase`` subclass should also provide the
following attributes:
* ``hasdst``: Boolean whether or not the zone uses DST.
* ``_dst_offset`` / ``_std_offset``: :class:`datetime.timedelta` objects
representing the respective UTC offsets.
* ``_dst_abbr`` / ``_std_abbr``: Strings representing the timezone short
abbreviations in DST and STD, respectively.
* ``_hasdst``: Whether or not the zone has DST.
.. versionadded:: 2.6.0
"""
def __init__(self):
raise NotImplementedError('tzrangebase is an abstract base class')
def utcoffset(self, dt):
isdst = self._isdst(dt)
if isdst is None:
return None
elif isdst:
return self._dst_offset
else:
return self._std_offset
def dst(self, dt):
isdst = self._isdst(dt)
if isdst is None:
return None
elif isdst:
return self._dst_base_offset
else:
return ZERO
@tzname_in_python2
def tzname(self, dt):
if self._isdst(dt):
return self._dst_abbr
else:
return self._std_abbr
def fromutc(self, dt):
""" Given a datetime in UTC, return local time """
if not isinstance(dt, datetime):
raise TypeError("fromutc() requires a datetime argument")
if dt.tzinfo is not self:
raise ValueError("dt.tzinfo is not self")
# Get transitions - if there are none, fixed offset
transitions = self.transitions(dt.year)
if transitions is None:
return dt + self.utcoffset(dt)
# Get the transition times in UTC
dston, dstoff = transitions
dston -= self._std_offset
dstoff -= self._std_offset
utc_transitions = (dston, dstoff)
dt_utc = dt.replace(tzinfo=None)
isdst = self._naive_isdst(dt_utc, utc_transitions)
if isdst:
dt_wall = dt + self._dst_offset
else:
dt_wall = dt + self._std_offset
_fold = int(not isdst and self.is_ambiguous(dt_wall))
return enfold(dt_wall, fold=_fold)
def is_ambiguous(self, dt):
"""
Whether or not the "wall time" of a given datetime is ambiguous in this
zone.
:param dt:
A :py:class:`datetime.datetime`, naive or time zone aware.
:return:
Returns ``True`` if ambiguous, ``False`` otherwise.
.. versionadded:: 2.6.0
"""
if not self.hasdst:
return False
start, end = self.transitions(dt.year)
dt = dt.replace(tzinfo=None)
return (end <= dt < end + self._dst_base_offset)
def _isdst(self, dt):
if not self.hasdst:
return False
elif dt is None:
return None
transitions = self.transitions(dt.year)
if transitions is None:
return False
dt = dt.replace(tzinfo=None)
isdst = self._naive_isdst(dt, transitions)
# Handle ambiguous dates
if not isdst and self.is_ambiguous(dt):
return not self._fold(dt)
else:
return isdst
def _naive_isdst(self, dt, transitions):
dston, dstoff = transitions
dt = dt.replace(tzinfo=None)
if dston < dstoff:
isdst = dston <= dt < dstoff
else:
isdst = not dstoff <= dt < dston
return isdst
@property
def _dst_base_offset(self):
return self._dst_offset - self._std_offset
__hash__ = None
def __ne__(self, other):
return not (self == other)
def __repr__(self):
return "%s(...)" % self.__class__.__name__
__reduce__ = object.__reduce__

View File

@ -0,0 +1,80 @@
from datetime import timedelta
import weakref
from collections import OrderedDict
from six.moves import _thread
class _TzSingleton(type):
def __init__(cls, *args, **kwargs):
cls.__instance = None
super(_TzSingleton, cls).__init__(*args, **kwargs)
def __call__(cls):
if cls.__instance is None:
cls.__instance = super(_TzSingleton, cls).__call__()
return cls.__instance
class _TzFactory(type):
def instance(cls, *args, **kwargs):
"""Alternate constructor that returns a fresh instance"""
return type.__call__(cls, *args, **kwargs)
class _TzOffsetFactory(_TzFactory):
def __init__(cls, *args, **kwargs):
cls.__instances = weakref.WeakValueDictionary()
cls.__strong_cache = OrderedDict()
cls.__strong_cache_size = 8
cls._cache_lock = _thread.allocate_lock()
def __call__(cls, name, offset):
if isinstance(offset, timedelta):
key = (name, offset.total_seconds())
else:
key = (name, offset)
instance = cls.__instances.get(key, None)
if instance is None:
instance = cls.__instances.setdefault(key,
cls.instance(name, offset))
# This lock may not be necessary in Python 3. See GH issue #901
with cls._cache_lock:
cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
# Remove an item if the strong cache is overpopulated
if len(cls.__strong_cache) > cls.__strong_cache_size:
cls.__strong_cache.popitem(last=False)
return instance
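# Note: instances are deduplicated through the WeakValueDictionary above, while the
# small OrderedDict "strong cache" keeps the eight most recently requested offsets
# alive so they are not garbage-collected between uses.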
class _TzStrFactory(_TzFactory):
def __init__(cls, *args, **kwargs):
cls.__instances = weakref.WeakValueDictionary()
cls.__strong_cache = OrderedDict()
cls.__strong_cache_size = 8
cls.__cache_lock = _thread.allocate_lock()
def __call__(cls, s, posix_offset=False):
key = (s, posix_offset)
instance = cls.__instances.get(key, None)
if instance is None:
instance = cls.__instances.setdefault(key,
cls.instance(s, posix_offset))
# This lock may not be necessary in Python 3. See GH issue #901
with cls.__cache_lock:
cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
# Remove an item if the strong cache is overpopulated
if len(cls.__strong_cache) > cls.__strong_cache_size:
cls.__strong_cache.popitem(last=False)
return instance

File diff suppressed because it is too large

View File

@ -0,0 +1,370 @@
# -*- coding: utf-8 -*-
"""
This module provides an interface to the native time zone data on Windows,
including :py:class:`datetime.tzinfo` implementations.
Attempting to import this module on a non-Windows platform will raise an
:py:obj:`ImportError`.
"""
# This code was originally contributed by Jeffrey Harris.
import datetime
import struct
from six.moves import winreg
from six import text_type
try:
import ctypes
from ctypes import wintypes
except ValueError:
# ValueError is raised on non-Windows systems for some horrible reason.
raise ImportError("Running tzwin on non-Windows system")
from ._common import tzrangebase
__all__ = ["tzwin", "tzwinlocal", "tzres"]
ONEWEEK = datetime.timedelta(7)
TZKEYNAMENT = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones"
TZKEYNAME9X = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Time Zones"
TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation"
def _settzkeyname():
handle = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
try:
winreg.OpenKey(handle, TZKEYNAMENT).Close()
TZKEYNAME = TZKEYNAMENT
except WindowsError:
TZKEYNAME = TZKEYNAME9X
handle.Close()
return TZKEYNAME
TZKEYNAME = _settzkeyname()
class tzres(object):
"""
Class for accessing ``tzres.dll``, which contains timezone name related
resources.
.. versionadded:: 2.5.0
"""
p_wchar = ctypes.POINTER(wintypes.WCHAR) # Pointer to a wide char
def __init__(self, tzres_loc='tzres.dll'):
# Load the user32 DLL so we can load strings from tzres
user32 = ctypes.WinDLL('user32')
# Specify the LoadStringW function
user32.LoadStringW.argtypes = (wintypes.HINSTANCE,
wintypes.UINT,
wintypes.LPWSTR,
ctypes.c_int)
self.LoadStringW = user32.LoadStringW
self._tzres = ctypes.WinDLL(tzres_loc)
self.tzres_loc = tzres_loc
def load_name(self, offset):
"""
Load a timezone name from a DLL offset (integer).
>>> from dateutil.tzwin import tzres
>>> tzr = tzres()
>>> print(tzr.load_name(112))
'Eastern Standard Time'
:param offset:
A positive integer value referring to a string from the tzres dll.
.. note::
Offsets found in the registry are generally of the form
``@tzres.dll,-114``. The offset in this case is 114, not -114.
"""
resource = self.p_wchar()
lpBuffer = ctypes.cast(ctypes.byref(resource), wintypes.LPWSTR)
nchar = self.LoadStringW(self._tzres._handle, offset, lpBuffer, 0)
return resource[:nchar]
def name_from_string(self, tzname_str):
"""
Parse strings as returned from the Windows registry into the time zone
name as defined in the registry.
>>> from dateutil.tzwin import tzres
>>> tzr = tzres()
>>> print(tzr.name_from_string('@tzres.dll,-251'))
'Dateline Daylight Time'
>>> print(tzr.name_from_string('Eastern Standard Time'))
'Eastern Standard Time'
:param tzname_str:
A timezone name string as returned from a Windows registry key.
:return:
Returns the localized timezone string from tzres.dll if the string
is of the form `@tzres.dll,-offset`, else returns the input string.
"""
if not tzname_str.startswith('@'):
return tzname_str
name_splt = tzname_str.split(',-')
try:
offset = int(name_splt[1])
except:
raise ValueError("Malformed timezone string.")
return self.load_name(offset)
class tzwinbase(tzrangebase):
"""tzinfo class based on win32's timezones available in the registry."""
def __init__(self):
raise NotImplementedError('tzwinbase is an abstract base class')
def __eq__(self, other):
# Compare on all relevant dimensions, including name.
if not isinstance(other, tzwinbase):
return NotImplemented
return (self._std_offset == other._std_offset and
self._dst_offset == other._dst_offset and
self._stddayofweek == other._stddayofweek and
self._dstdayofweek == other._dstdayofweek and
self._stdweeknumber == other._stdweeknumber and
self._dstweeknumber == other._dstweeknumber and
self._stdhour == other._stdhour and
self._dsthour == other._dsthour and
self._stdminute == other._stdminute and
self._dstminute == other._dstminute and
self._std_abbr == other._std_abbr and
self._dst_abbr == other._dst_abbr)
@staticmethod
def list():
"""Return a list of all time zones known to the system."""
with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
with winreg.OpenKey(handle, TZKEYNAME) as tzkey:
result = [winreg.EnumKey(tzkey, i)
for i in range(winreg.QueryInfoKey(tzkey)[0])]
return result
def display(self):
"""
Return the display name of the time zone.
"""
return self._display
def transitions(self, year):
"""
For a given year, get the DST on and off transition times, expressed
always on the standard time side. For zones with no transitions, this
function returns ``None``.
:param year:
The year whose transitions you would like to query.
:return:
Returns a :class:`tuple` of :class:`datetime.datetime` objects,
``(dston, dstoff)`` for zones with an annual DST transition, or
``None`` for fixed offset zones.
"""
if not self.hasdst:
return None
dston = picknthweekday(year, self._dstmonth, self._dstdayofweek,
self._dsthour, self._dstminute,
self._dstweeknumber)
dstoff = picknthweekday(year, self._stdmonth, self._stddayofweek,
self._stdhour, self._stdminute,
self._stdweeknumber)
# Ambiguous dates default to the STD side
dstoff -= self._dst_base_offset
return dston, dstoff
def _get_hasdst(self):
return self._dstmonth != 0
@property
def _dst_base_offset(self):
return self._dst_base_offset_
class tzwin(tzwinbase):
"""
Time zone object created from the zone info in the Windows registry
These are similar to :py:class:`dateutil.tz.tzrange` objects in that
the time zone data is provided in the format of a single offset rule
for either 0 or 2 time zone transitions per year.
:param: name
The name of a Windows time zone key, e.g. "Eastern Standard Time".
The full list of keys can be retrieved with :func:`tzwin.list`.
"""
def __init__(self, name):
self._name = name
with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
tzkeyname = text_type("{kn}\\{name}").format(kn=TZKEYNAME, name=name)
with winreg.OpenKey(handle, tzkeyname) as tzkey:
keydict = valuestodict(tzkey)
self._std_abbr = keydict["Std"]
self._dst_abbr = keydict["Dlt"]
self._display = keydict["Display"]
# See http://www.jsiinc.com/SUBA/tip0300/rh0398.htm
tup = struct.unpack("=3l16h", keydict["TZI"])
stdoffset = -tup[0]-tup[1] # Bias + StandardBias * -1
dstoffset = stdoffset-tup[2] # + DaylightBias * -1
self._std_offset = datetime.timedelta(minutes=stdoffset)
self._dst_offset = datetime.timedelta(minutes=dstoffset)
# for the meaning see the win32 TIME_ZONE_INFORMATION structure docs
# http://msdn.microsoft.com/en-us/library/windows/desktop/ms725481(v=vs.85).aspx
(self._stdmonth,
self._stddayofweek, # Sunday = 0
self._stdweeknumber, # Last = 5
self._stdhour,
self._stdminute) = tup[4:9]
(self._dstmonth,
self._dstdayofweek, # Sunday = 0
self._dstweeknumber, # Last = 5
self._dsthour,
self._dstminute) = tup[12:17]
self._dst_base_offset_ = self._dst_offset - self._std_offset
self.hasdst = self._get_hasdst()
def __repr__(self):
return "tzwin(%s)" % repr(self._name)
def __reduce__(self):
return (self.__class__, (self._name,))
class tzwinlocal(tzwinbase):
"""
Class representing the local time zone information in the Windows registry
While :class:`dateutil.tz.tzlocal` makes system calls (via the :mod:`time`
module) to retrieve time zone information, ``tzwinlocal`` retrieves the
rules directly from the Windows registry and creates an object like
:class:`dateutil.tz.tzwin`.
Because Windows does not have an equivalent of :func:`time.tzset`, on
Windows, :class:`dateutil.tz.tzlocal` instances will always reflect the
time zone settings *at the time that the process was started*, meaning
changes to the machine's time zone settings during the run of a program
on Windows will **not** be reflected by :class:`dateutil.tz.tzlocal`.
Because ``tzwinlocal`` reads the registry directly, it is unaffected by
this issue.
"""
def __init__(self):
with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
with winreg.OpenKey(handle, TZLOCALKEYNAME) as tzlocalkey:
keydict = valuestodict(tzlocalkey)
self._std_abbr = keydict["StandardName"]
self._dst_abbr = keydict["DaylightName"]
try:
tzkeyname = text_type('{kn}\\{sn}').format(kn=TZKEYNAME,
sn=self._std_abbr)
with winreg.OpenKey(handle, tzkeyname) as tzkey:
_keydict = valuestodict(tzkey)
self._display = _keydict["Display"]
except OSError:
self._display = None
stdoffset = -keydict["Bias"]-keydict["StandardBias"]
dstoffset = stdoffset-keydict["DaylightBias"]
self._std_offset = datetime.timedelta(minutes=stdoffset)
self._dst_offset = datetime.timedelta(minutes=dstoffset)
# For reasons unclear, in this particular key, the day of week has been
# moved to the END of the SYSTEMTIME structure.
tup = struct.unpack("=8h", keydict["StandardStart"])
(self._stdmonth,
self._stdweeknumber, # Last = 5
self._stdhour,
self._stdminute) = tup[1:5]
self._stddayofweek = tup[7]
tup = struct.unpack("=8h", keydict["DaylightStart"])
(self._dstmonth,
self._dstweeknumber, # Last = 5
self._dsthour,
self._dstminute) = tup[1:5]
self._dstdayofweek = tup[7]
self._dst_base_offset_ = self._dst_offset - self._std_offset
self.hasdst = self._get_hasdst()
def __repr__(self):
return "tzwinlocal()"
def __str__(self):
# str will return the standard name, not the daylight name.
return "tzwinlocal(%s)" % repr(self._std_abbr)
def __reduce__(self):
return (self.__class__, ())
def picknthweekday(year, month, dayofweek, hour, minute, whichweek):
""" dayofweek == 0 means Sunday, whichweek 5 means last instance """
first = datetime.datetime(year, month, 1, hour, minute)
# This will work if dayofweek is ISO weekday (1-7) or Microsoft-style (0-6),
# Because 7 % 7 = 0
weekdayone = first.replace(day=((dayofweek - first.isoweekday()) % 7) + 1)
wd = weekdayone + ((whichweek - 1) * ONEWEEK)
if (wd.month != month):
wd -= ONEWEEK
return wd
def valuestodict(key):
"""Convert a registry key's values to a dictionary."""
dout = {}
size = winreg.QueryInfoKey(key)[1]
tz_res = None
for i in range(size):
key_name, value, dtype = winreg.EnumValue(key, i)
if dtype == winreg.REG_DWORD or dtype == winreg.REG_DWORD_LITTLE_ENDIAN:
# If it's a DWORD (32-bit integer), it's stored as unsigned - convert
# that to a proper signed integer
if value & (1 << 31):
value = value - (1 << 32)
elif dtype == winreg.REG_SZ:
# If it's a reference to the tzres DLL, load the actual string
if value.startswith('@tzres'):
tz_res = tz_res or tzres()
value = tz_res.name_from_string(value)
value = value.rstrip('\x00') # Remove trailing nulls
dout[key_name] = value
return dout
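# --- Hedged usage sketch (editor's addition, not part of upstream dateutil) ---
# Assumes a Windows host where the registry keys read above exist; zone key
# names such as "Eastern Standard Time" are the strings returned by tzwin.list().
if __name__ == "__main__":
    import datetime
    eastern = tzwin("Eastern Standard Time")
    # July 1st falls inside DST for this zone, so the offset is UTC-4.
    dt = datetime.datetime(2022, 7, 1, 12, 0, tzinfo=eastern)
    print(dt.utcoffset())              # -1 day, 20:00:00  (i.e. -04:00)
    print(eastern.transitions(2022))   # (DST-on, DST-off), on the standard-time side
    print(tzwinlocal())                # the zone the local machine is configured for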

View File

@ -0,0 +1,2 @@
# tzwin has moved to dateutil.tz.win
from .tz.win import *

View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
This module offers general convenience and utility functions for dealing with
datetimes.
.. versionadded:: 2.7.0
"""
from __future__ import unicode_literals
from datetime import datetime, time
def today(tzinfo=None):
"""
Returns a :py:class:`datetime` representing the current day at midnight
:param tzinfo:
The time zone to attach (also used to determine the current day).
:return:
A :py:class:`datetime.datetime` object representing the current day
at midnight.
"""
dt = datetime.now(tzinfo)
return datetime.combine(dt.date(), time(0, tzinfo=tzinfo))
def default_tzinfo(dt, tzinfo):
"""
Sets the ``tzinfo`` parameter on naive datetimes only
This is useful for example when you are provided a datetime that may have
either an implicit or explicit time zone, such as when parsing a time zone
string.
.. doctest::
>>> from dateutil.tz import tzoffset
>>> from dateutil.parser import parse
>>> from dateutil.utils import default_tzinfo
>>> dflt_tz = tzoffset("EST", -18000)
>>> print(default_tzinfo(parse('2014-01-01 12:30 UTC'), dflt_tz))
2014-01-01 12:30:00+00:00
>>> print(default_tzinfo(parse('2014-01-01 12:30'), dflt_tz))
2014-01-01 12:30:00-05:00
:param dt:
The datetime on which to replace the time zone
:param tzinfo:
The :py:class:`datetime.tzinfo` subclass instance to assign to
``dt`` if (and only if) it is naive.
:return:
Returns an aware :py:class:`datetime.datetime`.
"""
if dt.tzinfo is not None:
return dt
else:
return dt.replace(tzinfo=tzinfo)
def within_delta(dt1, dt2, delta):
"""
Useful for comparing two datetimes that may differ by a negligible amount
and should be considered equal.
"""
delta = abs(delta)
difference = dt1 - dt2
return -delta <= difference <= delta
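# --- Hedged usage sketch (editor's addition, not part of upstream dateutil) ---
# within_delta() treats two datetimes as equal when they differ by no more than
# the given timedelta; the sign of ``delta`` is ignored.
if __name__ == "__main__":
    from datetime import timedelta
    d1 = datetime(2022, 11, 27, 12, 0, 0)
    d2 = datetime(2022, 11, 27, 12, 0, 3)
    print(within_delta(d1, d2, timedelta(seconds=5)))   # True
    print(within_delta(d1, d2, timedelta(seconds=1)))   # False
    print(today())                                      # current day at midnight, naive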

View File

@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
import warnings
import json
from tarfile import TarFile
from pkgutil import get_data
from io import BytesIO
from dateutil.tz import tzfile as _tzfile
__all__ = ["get_zonefile_instance", "gettz", "gettz_db_metadata"]
ZONEFILENAME = "dateutil-zoneinfo.tar.gz"
METADATA_FN = 'METADATA'
class tzfile(_tzfile):
def __reduce__(self):
return (gettz, (self._filename,))
def getzoneinfofile_stream():
try:
return BytesIO(get_data(__name__, ZONEFILENAME))
except IOError as e: # TODO switch to FileNotFoundError?
warnings.warn("I/O error({0}): {1}".format(e.errno, e.strerror))
return None
class ZoneInfoFile(object):
def __init__(self, zonefile_stream=None):
if zonefile_stream is not None:
with TarFile.open(fileobj=zonefile_stream) as tf:
self.zones = {zf.name: tzfile(tf.extractfile(zf), filename=zf.name)
for zf in tf.getmembers()
if zf.isfile() and zf.name != METADATA_FN}
# deal with links: They'll point to their parent object. Less
# waste of memory
links = {zl.name: self.zones[zl.linkname]
for zl in tf.getmembers() if
zl.islnk() or zl.issym()}
self.zones.update(links)
try:
metadata_json = tf.extractfile(tf.getmember(METADATA_FN))
metadata_str = metadata_json.read().decode('UTF-8')
self.metadata = json.loads(metadata_str)
except KeyError:
# no metadata in tar file
self.metadata = None
else:
self.zones = {}
self.metadata = None
def get(self, name, default=None):
"""
Wrapper for :func:`ZoneInfoFile.zones.get`. This is a convenience method
for retrieving zones from the zone dictionary.
:param name:
The name of the zone to retrieve. (Generally IANA zone names)
:param default:
The value to return in the event of a missing key.
.. versionadded:: 2.6.0
"""
return self.zones.get(name, default)
# The current API has gettz as a module function, although in fact it taps into
# a stateful class. So as a workaround for now, without changing the API, we
# will create a new "global" class instance the first time a user requests a
# timezone. Ugly, but adheres to the api.
#
# TODO: Remove after deprecation period.
_CLASS_ZONE_INSTANCE = []
def get_zonefile_instance(new_instance=False):
"""
This is a convenience function which provides a :class:`ZoneInfoFile`
instance using the data provided by the ``dateutil`` package. By default, it
caches a single instance of the ZoneInfoFile object and returns that.
:param new_instance:
If ``True``, a new instance of :class:`ZoneInfoFile` is instantiated and
used as the cached instance for the next call. Otherwise, new instances
are created only as necessary.
:return:
Returns a :class:`ZoneInfoFile` object.
.. versionadded:: 2.6
"""
if new_instance:
zif = None
else:
zif = getattr(get_zonefile_instance, '_cached_instance', None)
if zif is None:
zif = ZoneInfoFile(getzoneinfofile_stream())
get_zonefile_instance._cached_instance = zif
return zif
def gettz(name):
"""
This retrieves a time zone from the local zoneinfo tarball that is packaged
with dateutil.
:param name:
An IANA-style time zone name, as found in the zoneinfo file.
:return:
Returns a :class:`dateutil.tz.tzfile` time zone object.
.. warning::
It is generally inadvisable to use this function, and it is only
provided for API compatibility with earlier versions. This is *not*
equivalent to ``dateutil.tz.gettz()``, which selects an appropriate
time zone based on the inputs, favoring system zoneinfo. This is ONLY
for accessing the dateutil-specific zoneinfo (which may be out of
date compared to the system zoneinfo).
.. deprecated:: 2.6
If you need to use a specific zoneinfofile over the system zoneinfo,
instantiate a :class:`dateutil.zoneinfo.ZoneInfoFile` object and call
:func:`dateutil.zoneinfo.ZoneInfoFile.get(name)` instead.
Use :func:`get_zonefile_instance` to retrieve an instance of the
dateutil-provided zoneinfo.
"""
warnings.warn("zoneinfo.gettz() will be removed in future versions, "
"to use the dateutil-provided zoneinfo files, instantiate a "
"ZoneInfoFile object and use ZoneInfoFile.zones.get() "
"instead. See the documentation for details.",
DeprecationWarning)
if len(_CLASS_ZONE_INSTANCE) == 0:
_CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
return _CLASS_ZONE_INSTANCE[0].zones.get(name)
def gettz_db_metadata():
""" Get the zonefile metadata
See `zonefile_metadata`_
:returns:
A dictionary with the database metadata
.. deprecated:: 2.6
See deprecation warning in :func:`zoneinfo.gettz`. To get metadata,
query the attribute ``zoneinfo.ZoneInfoFile.metadata``.
"""
warnings.warn("zoneinfo.gettz_db_metadata() will be removed in future "
"versions, to use the dateutil-provided zoneinfo files, "
"ZoneInfoFile object and query the 'metadata' attribute "
"instead. See the documentation for details.",
DeprecationWarning)
if len(_CLASS_ZONE_INSTANCE) == 0:
_CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
return _CLASS_ZONE_INSTANCE[0].metadata
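# --- Hedged usage sketch (editor's addition, not part of upstream dateutil) ---
# get_zonefile_instance() is the supported entry point; gettz() and
# gettz_db_metadata() above remain only for backwards compatibility.
if __name__ == "__main__":
    zif = get_zonefile_instance()
    print(zif.get("America/New_York"))   # a tzfile instance, or None if unknown
    print(zif.metadata)                  # metadata of the bundled zoneinfo, if any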

View File

@ -0,0 +1,75 @@
import logging
import os
import tempfile
import shutil
import json
from subprocess import check_call, check_output
from tarfile import TarFile
from dateutil.zoneinfo import METADATA_FN, ZONEFILENAME
def rebuild(filename, tag=None, format="gz", zonegroups=[], metadata=None):
"""Rebuild the internal timezone info in dateutil/zoneinfo/zoneinfo*tar*
filename is the timezone tarball from ``ftp.iana.org/tz``.
"""
tmpdir = tempfile.mkdtemp()
zonedir = os.path.join(tmpdir, "zoneinfo")
moduledir = os.path.dirname(__file__)
try:
with TarFile.open(filename) as tf:
for name in zonegroups:
tf.extract(name, tmpdir)
filepaths = [os.path.join(tmpdir, n) for n in zonegroups]
_run_zic(zonedir, filepaths)
# write metadata file
with open(os.path.join(zonedir, METADATA_FN), 'w') as f:
json.dump(metadata, f, indent=4, sort_keys=True)
target = os.path.join(moduledir, ZONEFILENAME)
with TarFile.open(target, "w:%s" % format) as tf:
for entry in os.listdir(zonedir):
entrypath = os.path.join(zonedir, entry)
tf.add(entrypath, entry)
finally:
shutil.rmtree(tmpdir)
def _run_zic(zonedir, filepaths):
"""Calls the ``zic`` compiler in a compatible way to get a "fat" binary.
Recent versions of ``zic`` default to ``-b slim``, while older versions
don't even have the ``-b`` option (but default to "fat" binaries). The
current version of dateutil does not support Version 2+ TZif files, which
causes problems when used in conjunction with "slim" binaries, so this
function is used to ensure that we always get a "fat" binary.
"""
try:
help_text = check_output(["zic", "--help"])
except OSError as e:
_print_on_nosuchfile(e)
raise
if b"-b " in help_text:
bloat_args = ["-b", "fat"]
else:
bloat_args = []
check_call(["zic"] + bloat_args + ["-d", zonedir] + filepaths)
def _print_on_nosuchfile(e):
"""Print helpful troubleshooting message
e is an exception raised by subprocess.check_call()
"""
if e.errno == 2:
logging.error(
"Could not find zic. Perhaps you need to install "
"libc-bin or some other package that provides it, "
"or it's not in your PATH?")

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [2020] [Paul Davis <paul.joseph.davis@gmail.com>]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,154 @@
Metadata-Version: 2.1
Name: ghp-import
Version: 2.1.0
Summary: Copy your docs directly to the gh-pages branch.
Home-page: https://github.com/c-w/ghp-import
Author: Paul Joseph Davis
Author-email: paul.joseph.davis@gmail.com
License: Apache Software License
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Intended Audience :: Developers
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 3
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: python-dateutil (>=2.8.1)
Provides-Extra: dev
Requires-Dist: twine ; extra == 'dev'
Requires-Dist: markdown ; extra == 'dev'
Requires-Dist: flake8 ; extra == 'dev'
Requires-Dist: wheel ; extra == 'dev'
GitHub Pages Import
===================
[![CI status](https://github.com/davisp/ghp-import/workflows/CI/badge.svg)](https://github.com/davisp/ghp-import/actions?query=workflow%3Aci)
[![CircleCI](https://circleci.com/gh/c-w/ghp-import/tree/master.svg?style=svg)](https://circleci.com/gh/c-w/ghp-import/tree/master)
[![TravisCI](https://travis-ci.org/c-w/ghp-import.svg?branch=master)](https://travis-ci.org/c-w/ghp-import)
[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://opensource.org/licenses/Apache-2.0)
[![Version](https://img.shields.io/pypi/v/ghp-import.svg)](https://pypi.org/project/ghp-import/)
As part of [gunicorn][gunicorn], [Benoit Chesneau][benoit] and [Paul Davis][davisp]
were looking at how to host documentation. There's the obvious method of
using [GitHub's post-receive hook][github-post] to trigger doc builds and rsync
to a webserver, but we ended up wanting to try out github's hosting to make the
whole interface a bit more robust.
[GitHub Pages][gh-pages] is a pretty awesome service that GitHub provides for
hosting project documentation. The only thing is that it requires a
`gh-pages` branch that is the site's document root. This means that keeping
documentation sources in the branch with code is a bit difficult. And it really
turns into a head scratcher for things like [Sphinx][sphinx] that want to
access documentation sources and code sources at the same time.
Then we stumbled across an interesting looking package called
[github-tools][github-tools] that looked almost like what we wanted. It was a tad
complicated and more involved than we wanted but it gave us an idea. Why not
just write a script that can copy a directory to the `gh-pages` branch of the
repository? This saves us from even having to think about the branch and
everything becomes magical.
This is what `ghp-import` was written for.
[gunicorn]: http://www.gunicorn.com/ "Gunicorn"
[benoit]: http://github.com/benoitc "Benoît Chesneau"
[davisp]: http://github.com/davisp "Paul J. Davis"
[github-post]: https://help.github.com/articles/post-receive-hooks "GitHub Post-Receive Hook"
[gh-pages]: http://pages.github.com/ "GitHub Pages"
[sphinx]: http://sphinx.pocoo.org/ "Sphinx Documentation"
[github-tools]: http://dinoboff.github.com/github-tools/ "github-tools"
Big Fat Warning
---------------
This will **DESTROY** your `gh-pages` branch. If you love it, you'll want to
take backups before playing with this. This script assumes that `gh-pages` is
100% derivative. You should never edit files in your `gh-pages` branch by hand
if you're using this script because you will lose your work.
When used with a prefix, only files below the set prefix will be destroyed, limiting the
above warning to just that directory and everything below it.
Usage
-----
```
Usage: ghp-import [OPTIONS] DIRECTORY
Options:
-n, --no-jekyll Include a .nojekyll file in the branch.
-c CNAME, --cname=CNAME
Write a CNAME file with the given CNAME.
-m MESG, --message=MESG
The commit message to use on the target branch.
-p, --push Push the branch to origin/{branch} after committing.
-x PREFIX, --prefix=PREFIX
The prefix to add to each file that gets pushed to the
remote. Only files below this prefix will be cleared
out. [none]
-f, --force Force the push to the repository.
-o, --no-history Force new commit without parent history.
-r REMOTE, --remote=REMOTE
The name of the remote to push to. [origin]
-b BRANCH, --branch=BRANCH
Name of the branch to write to. [gh-pages]
-s, --shell Use the shell when invoking Git. [False]
-l, --follow-links Follow symlinks when adding files. [False]
-h, --help show this help message and exit
```
It's pretty simple. Inside your repository just run `ghp-import $DOCS_DIR`
where `$DOCS_DIR` is the path to the **built** documentation. This will write a
commit to your `gh-pages` branch with the current documents in it.
If you specify `-p` it will also attempt to push the `gh-pages` branch to
GitHub. By default it'll just run `git push origin gh-pages`. You can specify
a different remote using the `-r` flag.
The `-o` option will discard any previous history and ensure that only a
single commit is always pushed to the `gh-pages` branch. This is useful to
avoid bloating the repository size and is **highly recommended**.
You can specify a different branch with `-b`. This is useful for user and
organization pages, which are served from the `master` branch.
Some Windows users report needing to pass Git commands through the shell, which can be accomplished by passing `-s`.
The `-l` option will cause the import to follow symlinks for users that have odd configurations that include symlinking outside of their documentation directory.
Python Usage
------------
You can also call ghp_import directly from your Python code as a library. The
library has one public function `ghp_import.ghp_import`, which accepts the
following arguments:
* `srcdir`: The path to the **built** documentation (required).
* `remote`: The name of the remote to push to. Default: `origin`.
* `branch`: Name of the branch to write to. Default: `gh-pages`.
* `mesg`: The commit message to use on the target branch. Default: `Update documentation`.
* `push`: Push the branch to {remote}/{branch} after committing. Default: `False`.
* `prefix`: The prefix to add to each file that gets pushed to the remote. Default: `None`.
* `force`: Force the push to the repository. Default: `False`.
* `no_history`: Force new commit without parent history. Default: `False`.
* `use_shell`: Use the shell when invoking Git. Default: `False`.
* `followlinks`: Follow symlinks when adding files. Default: `False`.
* `cname`: Write a CNAME file with the given CNAME. Default: `None`.
* `nojekyll`: Include a .nojekyll file in the branch. Default: `False`.
With Python's current working directory (cwd) inside your repository, do the
following:
```python
from ghp_import import ghp_import
ghp_import('docs', push=True, cname='example.com')
```
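For a call that exercises more of the arguments listed above, something like the following should work; the directory name, prefix and commit message are placeholders:
```python
from ghp_import import ghp_import

# 'site' is assumed to hold the built docs; 'docs' becomes the prefix on the branch.
ghp_import(
    'site',
    branch='gh-pages',
    mesg='Deploy documentation',
    prefix='docs',
    no_history=True,   # single commit with no parent history
    push=True,         # combined with no_history, the push is forced
)
```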

View File

@ -0,0 +1,10 @@
../../../bin/ghp-import,sha256=EkLD9La0ChhbncgMfGa6pTztBlH6VnVGXt7GS1dRghA,255
__pycache__/ghp_import.cpython-311.pyc,,
ghp_import-2.1.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
ghp_import-2.1.0.dist-info/LICENSE,sha256=C8j_tF8m7dHNDeT1BCWuLLRsWMbYrBE5hNQSC-NVr6k,11374
ghp_import-2.1.0.dist-info/METADATA,sha256=PCrYmDTJ2XjIuUkYM33d1t8Fva95SG2UphvN-t9b6y8,7177
ghp_import-2.1.0.dist-info/RECORD,,
ghp_import-2.1.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
ghp_import-2.1.0.dist-info/entry_points.txt,sha256=mk55YA2cS0KmQK9APJFk_1ny9zFWFOZDtOBZMO0cu1Y,48
ghp_import-2.1.0.dist-info/top_level.txt,sha256=QGVcxjaCFAMEV3ZX7ADAlIMIlsiyfplHNQi-JwrTgow,11
ghp_import.py,sha256=zvDcFrdka_GzgEkD1BjJrBfDHD3sCExSbnJHmBE1igU,9234

View File

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,3 @@
[console_scripts]
ghp-import = ghp_import:main

View File

@ -0,0 +1 @@
ghp_import

View File

@ -0,0 +1,306 @@
#! /usr/bin/env python
import errno
import os
import subprocess as sp
import sys
import time
from dateutil import tz
from datetime import datetime
try:
from shlex import quote
except ImportError:
from pipes import quote
__all__ = ['ghp_import']
__version__ = "2.1.0"
class GhpError(Exception):
def __init__(self, message):
self.message = message
if sys.version_info[0] == 3:
def enc(text):
if isinstance(text, bytes):
return text
return text.encode()
def dec(text):
if isinstance(text, bytes):
return text.decode('utf-8')
return text
def write(pipe, data):
try:
pipe.stdin.write(data)
except IOError as e:
if e.errno != errno.EPIPE:
raise
else:
def enc(text):
if isinstance(text, unicode): # noqa F821
return text.encode('utf-8')
return text
def dec(text):
if isinstance(text, unicode): # noqa F821
return text
return text.decode('utf-8')
def write(pipe, data):
pipe.stdin.write(data)
class Git(object):
def __init__(self, use_shell=False):
self.use_shell = use_shell
self.cmd = None
self.pipe = None
self.stderr = None
self.stdout = None
def check_repo(self):
if self.call('rev-parse') != 0:
error = self.stderr
if not error:
error = "Unknown Git error"
error = dec(error)
if error.startswith("fatal: "):
error = error[len("fatal: "):]
raise GhpError(error)
def try_rebase(self, remote, branch, no_history=False):
rc = self.call('rev-list', '--max-count=1', '%s/%s' % (remote, branch))
if rc != 0:
return True
rev = dec(self.stdout.strip())
if no_history:
rc = self.call('update-ref', '-d', 'refs/heads/%s' % branch)
else:
rc = self.call('update-ref', 'refs/heads/%s' % branch, rev)
if rc != 0:
return False
return True
def get_config(self, key):
self.call('config', key)
return self.stdout.strip()
def get_prev_commit(self, branch):
rc = self.call('rev-list', '--max-count=1', branch, '--')
if rc != 0:
return None
return dec(self.stdout).strip()
def open(self, *args, **kwargs):
if self.use_shell:
self.cmd = 'git ' + ' '.join(map(quote, args))
else:
self.cmd = ['git'] + list(args)
if sys.version_info >= (3, 2, 0):
kwargs['universal_newlines'] = False
for k in 'stdin stdout stderr'.split():
kwargs.setdefault(k, sp.PIPE)
kwargs['shell'] = self.use_shell
self.pipe = sp.Popen(self.cmd, **kwargs)
return self.pipe
def call(self, *args, **kwargs):
self.open(*args, **kwargs)
(self.stdout, self.stderr) = self.pipe.communicate()
return self.pipe.wait()
def check_call(self, *args, **kwargs):
kwargs["shell"] = self.use_shell
sp.check_call(['git'] + list(args), **kwargs)
def mk_when(timestamp=None):
if timestamp is None:
timestamp = int(time.time())
currtz = datetime.now(tz.tzlocal()).strftime('%z')
return "%s %s" % (timestamp, currtz)
def start_commit(pipe, git, branch, message, prefix=None):
uname = os.getenv('GIT_COMMITTER_NAME', dec(git.get_config('user.name')))
email = os.getenv('GIT_COMMITTER_EMAIL', dec(git.get_config('user.email')))
when = os.getenv('GIT_COMMITTER_DATE', mk_when())
write(pipe, enc('commit refs/heads/%s\n' % branch))
write(pipe, enc('committer %s <%s> %s\n' % (uname, email, when)))
write(pipe, enc('data %d\n%s\n' % (len(enc(message)), message)))
head = git.get_prev_commit(branch)
if head:
write(pipe, enc('from %s\n' % head))
if prefix:
write(pipe, enc('D %s\n' % prefix))
else:
write(pipe, enc('deleteall\n'))
def add_file(pipe, srcpath, tgtpath):
with open(srcpath, "rb") as handle:
if os.access(srcpath, os.X_OK):
write(pipe, enc('M 100755 inline %s\n' % tgtpath))
else:
write(pipe, enc('M 100644 inline %s\n' % tgtpath))
data = handle.read()
write(pipe, enc('data %d\n' % len(data)))
write(pipe, enc(data))
write(pipe, enc('\n'))
def add_nojekyll(pipe, prefix=None):
if prefix:
fpath = os.path.join(prefix, '.nojekyll')
else:
fpath = '.nojekyll'
write(pipe, enc('M 100644 inline %s\n' % fpath))
write(pipe, enc('data 0\n'))
write(pipe, enc('\n'))
def add_cname(pipe, cname):
write(pipe, enc('M 100644 inline CNAME\n'))
write(pipe, enc('data %d\n%s\n' % (len(enc(cname)), cname)))
def gitpath(fname):
norm = os.path.normpath(fname)
return "/".join(norm.split(os.path.sep))
def run_import(git, srcdir, **opts):
srcdir = dec(srcdir)
pipe = git.open('fast-import', '--date-format=rfc2822', '--quiet',
stdin=sp.PIPE, stdout=None, stderr=None)
start_commit(pipe, git, opts['branch'], opts['mesg'], opts['prefix'])
for path, _, fnames in os.walk(srcdir, followlinks=opts['followlinks']):
for fn in fnames:
fpath = os.path.join(path, fn)
gpath = gitpath(os.path.relpath(fpath, start=srcdir))
if opts['prefix']:
gpath = os.path.join(opts['prefix'], gpath)
add_file(pipe, fpath, gpath)
if opts['nojekyll']:
add_nojekyll(pipe, opts['prefix'])
if opts['cname'] is not None:
add_cname(pipe, opts['cname'])
write(pipe, enc('\n'))
pipe.stdin.close()
if pipe.wait() != 0:
sys.stdout.write(enc("Failed to process commit.\n"))
def options():
return [
(('-n', '--no-jekyll'), dict(
dest='nojekyll',
default=False,
action="store_true",
help='Include a .nojekyll file in the branch.',
)),
(('-c', '--cname'), dict(
dest='cname',
default=None,
help='Write a CNAME file with the given CNAME.',
)),
(('-m', '--message'), dict(
dest='mesg',
default='Update documentation',
help='The commit message to use on the target branch.',
)),
(('-p', '--push'), dict(
dest='push',
default=False,
action='store_true',
help='Push the branch to origin/{branch} after committing.',
)),
(('-x', '--prefix'), dict(
dest='prefix',
default=None,
help='The prefix to add to each file that gets pushed to the '
'remote. Only files below this prefix will be cleared '
'out. [%(default)s]',
)),
(('-f', '--force'), dict(
dest='force',
default=False, action='store_true',
help='Force the push to the repository.',
)),
(('-o', '--no-history'), dict(
dest='no_history',
default=False,
action='store_true',
help='Force new commit without parent history.',
)),
(('-r', '--remote'), dict(
dest='remote',
default='origin',
help='The name of the remote to push to. [%(default)s]',
)),
(('-b', '--branch'), dict(
dest='branch',
default='gh-pages',
help='Name of the branch to write to. [%(default)s]',
)),
(('-s', '--shell'), dict(
dest='use_shell',
default=False,
action='store_true',
help='Use the shell when invoking Git. [%(default)s]',
)),
(('-l', '--follow-links'), dict(
dest='followlinks',
default=False,
action='store_true',
help='Follow symlinks when adding files. [%(default)s]',
))
]
def ghp_import(srcdir, **kwargs):
if not os.path.isdir(srcdir):
raise GhpError("Not a directory: %s" % srcdir)
opts = {kwargs["dest"]: kwargs["default"] for _, kwargs in options()}
opts.update(kwargs)
git = Git(use_shell=opts['use_shell'])
git.check_repo()
if not git.try_rebase(opts['remote'], opts['branch'], opts['no_history']):
raise GhpError("Failed to rebase %s branch." % opts['branch'])
run_import(git, srcdir, **opts)
if opts['push']:
if opts['force'] or opts['no_history']:
git.check_call('push', opts['remote'], opts['branch'], '--force')
else:
git.check_call('push', opts['remote'], opts['branch'])
def main():
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("--version", action="version", version=__version__)
parser.add_argument("directory")
for args, kwargs in options():
parser.add_argument(*args, **kwargs)
args = parser.parse_args().__dict__
try:
ghp_import(args.pop("directory"), **args)
except GhpError as e:
parser.error(e.message)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,61 @@
"""
Python Markdown
A Python implementation of John Gruber's Markdown.
Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/
Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see LICENSE.md for details).
"""
import sys
# TODO: Remove this check at some point in the future.
# (also remove flake8's 'ignore E402' comments below)
if sys.version_info[0] < 3: # pragma: no cover
raise ImportError('A recent version of Python 3 is required.')
from .core import Markdown, markdown, markdownFromFile # noqa: E402
from .util import PY37 # noqa: E402
from .pep562 import Pep562 # noqa: E402
from .__meta__ import __version__, __version_info__ # noqa: E402
import warnings # noqa: E402
# For backward compatibility as some extensions expect it...
from .extensions import Extension # noqa
__all__ = ['Markdown', 'markdown', 'markdownFromFile']
__deprecated__ = {
"version": ("__version__", __version__),
"version_info": ("__version_info__", __version_info__)
}
def __getattr__(name):
"""Get attribute."""
deprecated = __deprecated__.get(name)
if deprecated:
warnings.warn(
"'{}' is deprecated. Use '{}' instead.".format(name, deprecated[0]),
category=DeprecationWarning,
stacklevel=(3 if PY37 else 4)
)
return deprecated[1]
raise AttributeError("module '{}' has no attribute '{}'".format(__name__, name))
if not PY37:
Pep562(__name__)
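# --- Hedged usage sketch (editor's addition, not part of the Markdown package) ---
# A minimal conversion using the two entry points re-exported above.
if __name__ == "__main__":
    import markdown
    print(markdown.markdown("# Hello *world*"))   # <h1>Hello <em>world</em></h1>
    md = markdown.Markdown(output_format="html")
    print(md.convert("The parser object can be *reused* across documents."))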

View File

@ -0,0 +1,151 @@
"""
Python Markdown
A Python implementation of John Gruber's Markdown.
Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/
Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see LICENSE.md for details).
"""
import sys
import optparse
import codecs
import warnings
import markdown
try:
# We use `unsafe_load` because users may need to pass in actual Python
# objects. As this is only available from the CLI, the user has much
# worse problems if an attacker can use this as an attack vector.
from yaml import unsafe_load as yaml_load
except ImportError: # pragma: no cover
try:
# Fall back to PyYAML <5.1
from yaml import load as yaml_load
except ImportError:
# Fall back to JSON
from json import load as yaml_load
import logging
from logging import DEBUG, WARNING, CRITICAL
logger = logging.getLogger('MARKDOWN')
def parse_options(args=None, values=None):
"""
Define and parse `optparse` options for command-line usage.
"""
usage = """%prog [options] [INPUTFILE]
(STDIN is assumed if no INPUTFILE is given)"""
desc = "A Python implementation of John Gruber's Markdown. " \
"https://Python-Markdown.github.io/"
ver = "%%prog %s" % markdown.__version__
parser = optparse.OptionParser(usage=usage, description=desc, version=ver)
parser.add_option("-f", "--file", dest="filename", default=None,
help="Write output to OUTPUT_FILE. Defaults to STDOUT.",
metavar="OUTPUT_FILE")
parser.add_option("-e", "--encoding", dest="encoding",
help="Encoding for input and output files.",)
parser.add_option("-o", "--output_format", dest="output_format",
default='xhtml', metavar="OUTPUT_FORMAT",
help="Use output format 'xhtml' (default) or 'html'.")
parser.add_option("-n", "--no_lazy_ol", dest="lazy_ol",
action='store_false', default=True,
help="Observe number of first item of ordered lists.")
parser.add_option("-x", "--extension", action="append", dest="extensions",
help="Load extension EXTENSION.", metavar="EXTENSION")
parser.add_option("-c", "--extension_configs",
dest="configfile", default=None,
help="Read extension configurations from CONFIG_FILE. "
"CONFIG_FILE must be of JSON or YAML format. YAML "
"format requires that a python YAML library be "
"installed. The parsed JSON or YAML must result in a "
"python dictionary which would be accepted by the "
"'extension_configs' keyword on the markdown.Markdown "
"class. The extensions must also be loaded with the "
"`--extension` option.",
metavar="CONFIG_FILE")
parser.add_option("-q", "--quiet", default=CRITICAL,
action="store_const", const=CRITICAL+10, dest="verbose",
help="Suppress all warnings.")
parser.add_option("-v", "--verbose",
action="store_const", const=WARNING, dest="verbose",
help="Print all warnings.")
parser.add_option("--noisy",
action="store_const", const=DEBUG, dest="verbose",
help="Print debug messages.")
(options, args) = parser.parse_args(args, values)
if len(args) == 0:
input_file = None
else:
input_file = args[0]
if not options.extensions:
options.extensions = []
extension_configs = {}
if options.configfile:
with codecs.open(
options.configfile, mode="r", encoding=options.encoding
) as fp:
try:
extension_configs = yaml_load(fp)
except Exception as e:
message = "Failed parsing extension config file: %s" % \
options.configfile
e.args = (message,) + e.args[1:]
raise
opts = {
'input': input_file,
'output': options.filename,
'extensions': options.extensions,
'extension_configs': extension_configs,
'encoding': options.encoding,
'output_format': options.output_format,
'lazy_ol': options.lazy_ol
}
return opts, options.verbose
def run(): # pragma: no cover
"""Run Markdown from the command line."""
# Parse options and adjust logging level if necessary
options, logging_level = parse_options()
if not options:
sys.exit(2)
logger.setLevel(logging_level)
console_handler = logging.StreamHandler()
logger.addHandler(console_handler)
if logging_level <= WARNING:
# Ensure deprecation warnings get displayed
warnings.filterwarnings('default')
logging.captureWarnings(True)
warn_logger = logging.getLogger('py.warnings')
warn_logger.addHandler(console_handler)
# Run
markdown.markdownFromFile(**options)
if __name__ == '__main__': # pragma: no cover
# Support running module as a commandline command.
# `python -m markdown [options] [args]`.
run()

View File

@ -0,0 +1,49 @@
"""
Python Markdown
A Python implementation of John Gruber's Markdown.
Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/
Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see LICENSE.md for details).
"""
# __version_info__ format:
# (major, minor, patch, dev/alpha/beta/rc/final, #)
# (1, 1, 2, 'dev', 0) => "1.1.2.dev0"
# (1, 1, 2, 'alpha', 1) => "1.1.2a1"
# (1, 2, 0, 'beta', 2) => "1.2b2"
# (1, 2, 0, 'rc', 4) => "1.2rc4"
# (1, 2, 0, 'final', 0) => "1.2"
__version_info__ = (3, 3, 7, 'final', 0)
def _get_version(version_info):
" Returns a PEP 440-compliant version number from version_info. "
assert len(version_info) == 5
assert version_info[3] in ('dev', 'alpha', 'beta', 'rc', 'final')
parts = 2 if version_info[2] == 0 else 3
v = '.'.join(map(str, version_info[:parts]))
if version_info[3] == 'dev':
v += '.dev' + str(version_info[4])
elif version_info[3] != 'final':
mapping = {'alpha': 'a', 'beta': 'b', 'rc': 'rc'}
v += mapping[version_info[3]] + str(version_info[4])
return v
__version__ = _get_version(__version_info__)

Some files were not shown because too many files have changed in this diff.