Fix typos again #988

Merged (1 commit) on Oct 3, 2024
8 changes: 4 additions & 4 deletions docs/cli.md
@@ -238,7 +238,7 @@ Optional Arguments:
asking for a compressed response. Usually better
for bandwidth but at the cost of more CPU work.
--connect-timeout CONNECT_TIMEOUT
-Maxium socket connection time to host. Defaults
+Maximum socket connection time to host. Defaults
to `5`.
--domain-parallelism DOMAIN_PARALLELISM
Max number of urls per domain to hit at the same
@@ -419,7 +419,7 @@ Optional Arguments:
asking for a compressed response. Usually better
for bandwidth but at the cost of more CPU work.
--connect-timeout CONNECT_TIMEOUT
-Maxium socket connection time to host. Defaults
+Maximum socket connection time to host. Defaults
to `5`.
-C, --content-filter CONTENT_FILTER
Regex used to filter fetched content.
@@ -1622,7 +1622,7 @@ Optional Arguments:
AMP urls when normalizing url. Defaults to
`True`.
--platform-aware Whether url parsing should know about some
-specififc platform such as Facebook, YouTube
+specific platform such as Facebook, YouTube
etc. into account when normalizing urls. Note
that this is different than activating
--facebook or --youtube.
@@ -2581,7 +2581,7 @@ Optional Arguments:
asking for a compressed response. Usually better
for bandwidth but at the cost of more CPU work.
--connect-timeout CONNECT_TIMEOUT
-Maxium socket connection time to host. Defaults
+Maximum socket connection time to host. Defaults
to `15`.
--domain-parallelism DOMAIN_PARALLELISM
Max number of urls per domain to hit at the same
10 changes: 5 additions & 5 deletions docs/crawlers.md
@@ -254,7 +254,7 @@ class MySpider(Spider):
- **persistent_storage_path** *Optional[str]*: path to a folder that will contain persistent on-disk resources for the crawler's queue and url cache. If not given, the crawler will work entirely in-memory, which means memory could be exceeded if the url queue or cache becomes too large and you also won't be able to resume if your python process crashes.
- **resume** *bool* `False`: whether to attempt to resume from persistent storage. Will raise if `persistent_storage_path=None`.
- **visit_urls_only_once** *bool* `False`: whether to guarantee the crawler won't visit the same url twice.
-- **normalized_url_cache** *bool* `False`: whether to use [`ural.normalize_url`](https://github.com/medialab/ural#normalize_url) before adding a url to the crawler's cache. This can be handy to avoid visting a same page having subtly different urls twice. This will do nothing if `visit_urls_only_once=False`.
+- **normalized_url_cache** *bool* `False`: whether to use [`ural.normalize_url`](https://github.com/medialab/ural#normalize_url) before adding a url to the crawler's cache. This can be handy to avoid visiting a same page having subtly different urls twice. This will do nothing if `visit_urls_only_once=False`.
- **max_depth** *Optional[int]*: global maximum allowed depth for the crawler to accept a job.
- **writer_root_directory** *Optional[str]*: root directory that will be used to resolve path written by the crawler's own threadsafe file writer.
- **sqlar** *bool* `False`: whether the crawler's threadsafe file writer should target a [sqlar](https://www.sqlite.org/sqlar/doc/trunk/README.md) archive instead.
@@ -272,7 +272,7 @@ class MySpider(Spider):
- **retryer_kwargs** *Optional[dict]*: arguments that will be given to [create_request_retryer](./web.md#create_request_retryer) to create the retryer for each of the spawned threads.
- **request_args** *Optional[Callable[[T], dict]]*: function returning arguments that will be given to the threaded [request](./web.md#request) call for a given item from the iterable.
- **use_pycurl** *bool* `False`: whether to use [`pycurl`](http://pycurl.io/) instead of [`urllib3`](https://urllib3.readthedocs.io/en/stable/) to perform the request. The `pycurl` library must be installed for this kwarg to work.
-- **compressed** *bool* `False`: whether to automatically specifiy the `Accept` header to ask the server to compress the response's body on the wire.
+- **compressed** *bool* `False`: whether to automatically specify the `Accept` header to ask the server to compress the response's body on the wire.
- **known_encoding** *Optional[str]*: encoding of the body of requested urls. Defaults to `None` which means this encoding will be inferred from the body itself.
- **max_redirects** *int* `5`: maximum number of redirections the request will be allowed to follow before raising an error.
- **stateful_redirects** *bool* `False`: whether to allow the resolver to be stateful and store cookies along the redirection chain. This is useful when dealing with GDPR compliance patterns from websites etc. but can hurt performance a little bit.
@@ -330,7 +330,7 @@ for result, written_path in crawler.crawl(callback=callback):

*Arguments*

-- **callback** *Optional[Callable[[Crawler, SuccessfulCrawlResult], T]]*: callback that can be used to perform IO-intensive tasks within the same thread used for peforming the crawler's request and to return additional information. If callback is given, the iterator returned by the method will yield `(result, callback_result)` instead of just `result`. Note that this callback must be threadsafe.
+- **callback** *Optional[Callable[[Crawler, SuccessfulCrawlResult], T]]*: callback that can be used to perform IO-intensive tasks within the same thread used for performing the crawler's request and to return additional information. If callback is given, the iterator returned by the method will yield `(result, callback_result)` instead of just `result`. Note that this callback must be threadsafe.

#### enqueue

@@ -446,7 +446,7 @@ class MySpider(Spider):

Method that must be implemented for the spider to be able to process the crawler's completed jobs.

-The method takes a [CrawlJob](#crawljob) instance, a HTTP [Response](./web.md#response) and must return either `None` or a 2-tuple containing: 1. some optional & arbitraty data extracted from the response, 2. an iterable of next targets for the crawler to enqueue.
+The method takes a [CrawlJob](#crawljob) instance, a HTTP [Response](./web.md#response) and must return either `None` or a 2-tuple containing: 1. some optional & arbitrary data extracted from the response, 2. an iterable of next targets for the crawler to enqueue.

Note that next crawl targets can be relative (they will be resolved wrt current's job last redirected url) and that their depth, if not provided, will default to the current job's depth + 1.

@@ -528,7 +528,7 @@ Those jobs are also provided to spider's processing functions and can be accesse

- **job** *[CrawlJob](#crawljob)*: job that was completed or errored.
- **data** *Optional[T]*: data extracted by the spider for the job.
-- **error** *Optional[Exception]*: error that happend when requesting the job's url.
+- **error** *Optional[Exception]*: error that happened when requesting the job's url.
- **error_code** *Optional[str]*: human-readable error code if an error happened when requesting the job's url.
- **response** *Optional[[Response](./web.md#response)]*: HTTP response if the job did not error.
- **degree** *int*: number of new jobs enqueued when processing this job.
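
Several of the corrected sentences above describe the `Spider.process` contract and the `crawl(callback=...)` pattern from docs/crawlers.md. Below is a minimal, non-authoritative sketch of both, assuming `Crawler` and `Spider` are importable from `minet.crawl`, that jobs expose `.url`, that responses expose `.status` and `.body`, and that `START_URL` is the entry-point attribute; treat all of those names as assumptions rather than confirmed API.

```python
from hashlib import md5

from minet.crawl import Crawler, Spider  # assumed import paths


class MySpider(Spider):
    # Assumed attribute name for the crawl's entry point.
    START_URL = "https://www.example.com"

    def process(self, job, response):
        # Per the contract documented above: return None, or a 2-tuple of
        # (arbitrary data, iterable of next targets). Relative targets are
        # resolved against the job's last redirected url.
        if response.status != 200:
            return None
        return {"status": response.status}, ["/about", "/contact"]


def callback(crawler, result):
    # Runs in the same worker thread as the request, hence must be threadsafe.
    name = md5(result.job.url.encode()).hexdigest() + ".html"
    with open(name, "wb") as f:
        f.write(result.response.body)  # assumed .body attribute (raw bytes)
    return name


with Crawler(MySpider()) as crawler:
    # With a callback, the iterator yields (result, callback_result) pairs.
    for result, written_path in crawler.crawl(callback=callback):
        print(result.job.url, "->", written_path)
```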
8 changes: 4 additions & 4 deletions docs/executors.md
@@ -107,7 +107,7 @@ Download urls as fast as possible. Yields [RequestResult](#requestresult) object
- **callback** *Optional[Callable[[T, str, Response], C]]*: callback that can be used to perform IO-intensive tasks within the same thread used for the request and to return additional information. If callback is given, the iterator returned by the pool will yield `(result, callback_result)` instead of just `result`. Note that this callback must be threadsafe.
- **request_args** *Optional[Callable[[T], dict]]*: function returning arguments that will be given to the threaded [request](./web.md#request) call for a given item from the iterable.
- **use_pycurl** *bool* `False`: whether to use [`pycurl`](http://pycurl.io/) instead of [`urllib3`](https://urllib3.readthedocs.io/en/stable/) to perform the request. The `pycurl` library must be installed for this kwarg to work.
-- **compressed** *bool* `False`: whether to automatically specifiy the `Accept` header to ask the server to compress the response's body on the wire.
+- **compressed** *bool* `False`: whether to automatically specify the `Accept` header to ask the server to compress the response's body on the wire.
- **throttle** *float* `0.2`: time to wait, in seconds, between two calls to the same domain.
- **buffer_size** *int* `1024`: number of items to pull ahead of time from the iterable in hope of finding some url that can be requested immediately. Decreasing this number will ease up memory usage but can slow down overall performance.
- **domain_parallelism** *int* `1`: maximum number of concurrent calls allowed on a same domain.
@@ -127,7 +127,7 @@ Resolve urls as fast as possible. Yields [ResolveResult](#resolveresult) objects
- **callback** *Optional[Callable[[T, str, Response], C]]*: callback that can be used to perform IO-intensive tasks within the same thread used for the request and to return additional information. If callback is given, the iterator returned by the pool will yield `(result, callback_result)` instead of just `result`. Note that this callback must be threadsafe.
- **resolve_args** *Optional[Callable[[T], dict]]*: function returning arguments that will be given to the threaded [resolve](./web.md#resolve) call for a given item from the iterable.
- **use_pycurl** *bool* `False`: whether to use [`pycurl`](http://pycurl.io/) instead of [`urllib3`](https://urllib3.readthedocs.io/en/stable/) to perform the request. The `pycurl` library must be installed for this kwarg to work.
-- **compressed** *bool* `False`: whether to automatically specifiy the `Accept` header to ask the server to compress the response's body on the wire.
+- **compressed** *bool* `False`: whether to automatically specify the `Accept` header to ask the server to compress the response's body on the wire.
- **throttle** *float* `0.2`: time to wait, in seconds, between two calls to the same domain.
- **buffer_size** *int* `1024`: number of items to pull ahead of time from the iterable in hope of finding some url that can be requested immediately. Decreasing this number will ease up memory usage but can slow down overall performance.
- **domain_parallelism** *int* `1`: maximum number of concurrent calls allowed on a same domain.
@@ -145,7 +145,7 @@ Resolve urls as fast as possible. Yields [ResolveResult](#resolveresult) objects

- **item** *str | T*: item from the iterable given to [request](#request).
- **url** *Optional[str]*: url for the wrapped item, if any.
-- **error** *Optional[Exception]*: any error that was raised when peforming the HTTP request.
+- **error** *Optional[Exception]*: any error that was raised when performing the HTTP request.
- **error_code** *Optional[str]*: human-readable error code if any error was raised when performing the HTTP request.
- **response** *Optional[[Response](./web.md#response)]*: the completed response, if no error was raised.

@@ -183,7 +183,7 @@ assert successful_result.response is not None

- **item** *str | T*: item from the iterable given to [resolve](#resolve).
- **url** *Optional[str]*: url for the wrapped item, if any.
-- **error** *Optional[Exception]*: any error that was raised when peforming the HTTP request.
+- **error** *Optional[Exception]*: any error that was raised when performing the HTTP request.
- **error_code** *Optional[str]*: human-readable error code if any error was raised when performing the HTTP request.
- **stack** *Optional[List[[Redirection](./web.md#redirection)]]*: the redirection stack if no error was raised.

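
The `compressed` and error-field wording fixed above belongs to the executor's `request` method and its `RequestResult` objects. A rough sketch of draining such a pool follows; the `HTTPThreadPoolExecutor` name, its import path and context-manager usage, and the `.status` attribute of responses are assumptions drawn from minet's docs, not confirmed by this diff.

```python
from minet.executors import HTTPThreadPoolExecutor  # assumed import path

urls = ["https://www.lemonde.fr", "https://www.example.com"]

with HTTPThreadPoolExecutor() as executor:
    # compressed asks the server for a compressed body on the wire; throttle
    # is the per-domain wait in seconds documented above.
    for result in executor.request(urls, compressed=True, throttle=0.2):
        if result.error is not None:
            print(result.url, result.error_code)
        else:
            print(result.url, result.response.status)
```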
2 changes: 1 addition & 1 deletion docs/web.md
@@ -55,7 +55,7 @@ response = request(
- **raise_on_statuses** *Optional[Container[int]]*: if given, request will raise if the response has a status in the given set, instead of returning the response.
- **stateful** *bool* `False`: whether to allow the resolver to be stateful and store cookies along the redirection chain. This is useful when dealing with GDPR compliance patterns from websites etc. but can hurt performance a little bit.
- **use_pycurl** *bool* `False`: whether to use [`pycurl`](http://pycurl.io/) instead of [`urllib3`](https://urllib3.readthedocs.io/en/stable/) to perform the request. The `pycurl` library must be installed for this kwarg to work.
-- **compressed** *bool* `False`: whether to automatically specifiy the `Accept` header to ask the server to compress the response's body on the wire.
+- **compressed** *bool* `False`: whether to automatically specify the `Accept` header to ask the server to compress the response's body on the wire.
- **pool_manager** *Optional[urllib3.PoolManager]*: urllib3 pool manager to use to perform the request. Will use a default sensible pool manager if not given. This should only be cared about when you want to use a custom pool manager. This will not be used if `pycurl=True`.

## resolve
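
For reference, a small sketch of a single `request()` call combining the keyword arguments documented in the hunk above; the import path is assumed from docs/web.md and `.status` is an assumed attribute of the returned response.

```python
from minet.web import request  # assumed import path

response = request(
    "https://www.example.com",
    compressed=True,               # ask the server to compress the body on the wire
    stateful=True,                 # keep cookies along the redirection chain
    raise_on_statuses={429, 503},  # raise instead of returning these statuses
)

print(response.status)
```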
2 changes: 1 addition & 1 deletion minet/cli/crawl/__init__.py
@@ -113,7 +113,7 @@
},
"connect_timeout": {
"flag": "--connect-timeout",
"help": "Maxium socket connection time to host.",
"help": "Maximum socket connection time to host.",
"type": float,
"default": 5,
},
2 changes: 1 addition & 1 deletion minet/cli/url_parse/__init__.py
@@ -226,7 +226,7 @@ def __call__(self, parser, cli_args, values, option_string=None):
},
{
"flag": "--platform-aware",
"help": "Whether url parsing should know about some specififc platform such as Facebook, YouTube etc. into account when normalizing urls. Note that this is different than activating --facebook or --youtube.",
"help": "Whether url parsing should know about some specific platform such as Facebook, YouTube etc. into account when normalizing urls. Note that this is different than activating --facebook or --youtube.",
"action": "store_true",
},
],
2 changes: 1 addition & 1 deletion minet/exceptions.py
@@ -212,7 +212,7 @@ class PycurlProtocolError(PycurlError):
pass


-# NOTE: we cannot distinguish connexion error and unknown host errors
+# NOTE: we cannot distinguish connection error and unknown host errors
# This is the reason why `PycurlHostResolutionError` inherits from
# `PycurlProtocolError` so we can retry it.
class PycurlHostResolutionError(PycurlProtocolError):
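
The corrected NOTE above explains why `PycurlHostResolutionError` subclasses `PycurlProtocolError`: retry logic that catches the latter also covers unresolved-host failures. A hedged sketch of that idea, assuming both the exception and `request` are importable from the paths shown and relying on the documented `use_pycurl` flag:

```python
from minet.web import request                      # assumed import path
from minet.exceptions import PycurlProtocolError   # assumed import path


def fetch_with_retry(url, attempts=3):
    # Catching PycurlProtocolError also catches PycurlHostResolutionError,
    # which is the rationale stated in the NOTE above.
    for attempt in range(attempts):
        try:
            return request(url, use_pycurl=True)
        except PycurlProtocolError:
            if attempt == attempts - 1:
                raise


response = fetch_with_retry("https://www.example.com")
```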