Tutorial 4: How to Analyse OWI Data - Using Jupyter Notebooks#
In this tutorial we pull data via owilix and analyse it using standard Python-based tools.
Pulling a small collection#
In a first step, we pull a collection using owilix; the command below requests the curlie_full collection. Make sure owilix is installed in the environment.
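# Run the owilix command line tool through the notebook's shell escape (!).
# If you encounter problems here, run the command manually from a terminal.
# A "TypeError: AsyncClient.__init__() got an unexpected keyword argument 'proxies'"
# usually indicates that the installed httpx (0.28 or newer) is too new for the
# installed python-keycloak; upgrading python-keycloak or pinning httpx<0.28 should help.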
!owilix --yes remote pull all:latest#5/collectionName=curlie_full num_threads=1
Fetching datasets for specifier all:latest#5/collectionName=curlie_full
2025-09-24 10:18:38,763 - owilix - ERROR - Could not list datasets in datacenter it4i: AsyncClient.__init__() got an unexpected keyword argument 'proxies'
Traceback (most recent call last):
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 549, in list
repo_result, elapsed_time = future.result()
^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/_base.py", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
raise self._exception
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 502, in _list_repo_wrapper
result = repo.list(access, day, duration, query, cb_progress)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1554, in list
return self._list(access, day, duration, query, cb_progress, filter_zone=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1531, in _list
ds_records = self._lexis_ds_api.get_all_datasets(access=access, project=self.manager.name)
^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1492, in _lexis_ds_api
self.__lexis_ds_api = OWILexisDatasetAPI(self.manager.session)
^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/manager.py", line 559, in session
self._session = LexisSession(
^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/py4lexis/core/session.py", line 172, in __init__
self.__uc = kck_oi(in_cli=in_cli,
^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/py4lexis/core/kck_session.py", line 62, in __init__
self._oid = KeycloakOpenID(self.url,
^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/keycloak/keycloak_openid.py", line 127, in __init__
self.connection = ConnectionManager(
^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/keycloak/connection.py", line 105, in __init__
self.async_s = httpx.AsyncClient(verify=verify, proxies=proxies, cert=cert)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: AsyncClient.__init__() got an unexpected keyword argument 'proxies'
[09/24/25 10:18:38] ERROR 2025-09-24 10:18:38,763 - owilix ]8;id=826993;file:///Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py\repository.py]8;;\:]8;id=965559;file:///Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py#554\554]8;;\
- ERROR - Could not list datasets
in datacenter it4i:
AsyncClient.__init__() got an
unexpected keyword argument
'proxies'
╭─ Traceback (most recent call ─╮
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:549 in list │
│ │
│ 546 │ │ │ for future │
│ 547 │ │ │ │ dc = f │
│ 548 │ │ │ │ try: │
│ ❱ 549 │ │ │ │ │ re │
│ 550 │ │ │ │ │ re │
│ 551 │ │ │ │ │ re │
│ len(repo_result)) │
│ 552 │ │ │ │ │ re │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/_base.py:449 in │
│ result │
│ │
│ 446 │ │ │ │ if self │
│ 447 │ │ │ │ │ rai │
│ 448 │ │ │ │ elif se │
│ ❱ 449 │ │ │ │ │ ret │
│ 450 │ │ │ │ │
│ 451 │ │ │ │ self._c │
│ 452 │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/_base.py:401 in │
│ __get_result │
│ │
│ 398 │ def __get_result(se │
│ 399 │ │ if self._except │
│ 400 │ │ │ try: │
│ ❱ 401 │ │ │ │ raise s │
│ 402 │ │ │ finally: │
│ 403 │ │ │ │ # Break │
│ 404 │ │ │ │ self = │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/thread.py:58 in │
│ run │
│ │
│ 55 │ │ │ return │
│ 56 │ │ │
│ 57 │ │ try: │
│ ❱ 58 │ │ │ result = se │
│ 59 │ │ except BaseExce │
│ 60 │ │ │ self.future │
│ 61 │ │ │ # Break a r │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:502 in │
│ _list_repo_wrapper │
│ │
│ 499 │ │ │ │ │ │ │
│ 500 │ │ │ │ │ │ │
│ 501 │ │ start = time.t │
│ ❱ 502 │ │ result = repo. │
│ 503 │ │ elapsed_time = │
│ 504 │ │ return result, │
│ 505 │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1554 in list │
│ │
│ 1551 │ │ query=None, │
│ 1552 │ │ cb_progress=No │
│ 1553 │ ) -> list: │
│ ❱ 1554 │ │ return self._l │
│ 1555 │ │
│ 1556 │ │
│ 1557 │ def files(self, da │
│ List[str]: │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1531 in _list │
│ │
│ 1528 │ │ cb_progress=No │
│ 1529 │ │ filter_zone=Tr │
│ 1530 │ ) -> list: │
│ ❱ 1531 │ │ ds_records = s │
│ project=self.manager.n │
│ 1532 │ │ if not ds_reco │
│ 1533 │ │ │ return [] │
│ 1534 │ │ if filter_zone │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1492 in _lexis_ds_api │
│ │
│ 1489 │ @property │
│ 1490 │ def _lexis_ds_api( │
│ 1491 │ │ if self.__lexi │
│ ❱ 1492 │ │ │ self.__lex │
│ 1493 │ │ return self.__ │
│ 1494 │ │
│ 1495 │ def _init_irods(se │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/manager. │
│ py:559 in session │
│ │
│ 556 │ │ │ │ │ _re │
│ 557 │ │ │ │ │ try │
│ 558 │ │ │ │ │ │ │
│ ❱ 559 │ │ │ │ │ │ │
│ 560 │ │ │ │ │ │ │
│ 561 │ │ │ │ │ │ │
│ 562 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/py4lexis/core/sessio │
│ n.py:172 in __init__ │
│ │
│ 169 │ │ if login_method │
│ 170 │ │ │ offline_acc │
│ 171 │ │ │
│ ❱ 172 │ │ self.__uc = kck │
│ 173 │ │ │ │ │ │ │
│ 174 │ │ │ │ │ │ │
│ 175 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/py4lexis/core/kck_se │
│ ssion.py:62 in __init__ │
│ │
│ 59 │ │ │
│ 60 │ │ self._offline_a │
│ 61 │ │ │
│ ❱ 62 │ │ self._oid = Key │
│ 63 │ │ │ │ │ │ │
│ 64 │ │ │ │ │ │ │
│ 65 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/keycloak/keycloak_op │
│ enid.py:127 in __init__ │
│ │
│ 124 │ │ self.client_se │
│ 125 │ │ self.realm_nam │
│ 126 │ │ headers = cust │
│ ❱ 127 │ │ self.connectio │
│ 128 │ │ │ base_url=s │
│ 129 │ │ │ headers=he │
│ 130 │ │ │ timeout=ti │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/keycloak/connection. │
│ py:105 in __init__ │
│ │
│ 102 │ │ if proxies: │
│ 103 │ │ │ self._s.pro │
│ 104 │ │ │
│ ❱ 105 │ │ self.async_s = │
│ 106 │ │ self.async_s.au │
│ 107 │ │ self.async_s.tr │
│ 108 │
╰───────────────────────────────╯
TypeError: AsyncClient.__init__()
got an unexpected keyword
argument 'proxies'
2025-09-24 10:18:39,313 - owilix - ERROR - Could not list datasets in datacenter lrz: AsyncClient.__init__() got an unexpected keyword argument 'proxies'
Traceback (most recent call last):
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 549, in list
repo_result, elapsed_time = future.result()
^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/_base.py", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
raise self._exception
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 502, in _list_repo_wrapper
result = repo.list(access, day, duration, query, cb_progress)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1554, in list
return self._list(access, day, duration, query, cb_progress, filter_zone=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1531, in _list
ds_records = self._lexis_ds_api.get_all_datasets(access=access, project=self.manager.name)
^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py", line 1492, in _lexis_ds_api
self.__lexis_ds_api = OWILexisDatasetAPI(self.manager.session)
^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/manager.py", line 559, in session
self._session = LexisSession(
^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/py4lexis/core/session.py", line 172, in __init__
self.__uc = kck_oi(in_cli=in_cli,
^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/py4lexis/core/kck_session.py", line 62, in __init__
self._oid = KeycloakOpenID(self.url,
^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/keycloak/keycloak_openid.py", line 127, in __init__
self.connection = ConnectionManager(
^^^^^^^^^^^^^^^^^^
File "/Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/keycloak/connection.py", line 105, in __init__
self.async_s = httpx.AsyncClient(verify=verify, proxies=proxies, cert=cert)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: AsyncClient.__init__() got an unexpected keyword argument 'proxies'
[09/24/25 10:18:39] ERROR 2025-09-24 10:18:39,313 - owilix ]8;id=395044;file:///Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py\repository.py]8;;\:]8;id=71980;file:///Users/mgrani/bin/miniforge3/envs/owi/lib/python3.11/site-packages/owilix/core/repository.py#554\554]8;;\
- ERROR - Could not list datasets
in datacenter lrz:
AsyncClient.__init__() got an
unexpected keyword argument
'proxies'
╭─ Traceback (most recent call ─╮
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:549 in list │
│ │
│ 546 │ │ │ for future │
│ 547 │ │ │ │ dc = f │
│ 548 │ │ │ │ try: │
│ ❱ 549 │ │ │ │ │ re │
│ 550 │ │ │ │ │ re │
│ 551 │ │ │ │ │ re │
│ len(repo_result)) │
│ 552 │ │ │ │ │ re │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/_base.py:449 in │
│ result │
│ │
│ 446 │ │ │ │ if self │
│ 447 │ │ │ │ │ rai │
│ 448 │ │ │ │ elif se │
│ ❱ 449 │ │ │ │ │ ret │
│ 450 │ │ │ │ │
│ 451 │ │ │ │ self._c │
│ 452 │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/_base.py:401 in │
│ __get_result │
│ │
│ 398 │ def __get_result(se │
│ 399 │ │ if self._except │
│ 400 │ │ │ try: │
│ ❱ 401 │ │ │ │ raise s │
│ 402 │ │ │ finally: │
│ 403 │ │ │ │ # Break │
│ 404 │ │ │ │ self = │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/concu │
│ rrent/futures/thread.py:58 in │
│ run │
│ │
│ 55 │ │ │ return │
│ 56 │ │ │
│ 57 │ │ try: │
│ ❱ 58 │ │ │ result = se │
│ 59 │ │ except BaseExce │
│ 60 │ │ │ self.future │
│ 61 │ │ │ # Break a r │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:502 in │
│ _list_repo_wrapper │
│ │
│ 499 │ │ │ │ │ │ │
│ 500 │ │ │ │ │ │ │
│ 501 │ │ start = time.t │
│ ❱ 502 │ │ result = repo. │
│ 503 │ │ elapsed_time = │
│ 504 │ │ return result, │
│ 505 │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1554 in list │
│ │
│ 1551 │ │ query=None, │
│ 1552 │ │ cb_progress=No │
│ 1553 │ ) -> list: │
│ ❱ 1554 │ │ return self._l │
│ 1555 │ │
│ 1556 │ │
│ 1557 │ def files(self, da │
│ List[str]: │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1531 in _list │
│ │
│ 1528 │ │ cb_progress=No │
│ 1529 │ │ filter_zone=Tr │
│ 1530 │ ) -> list: │
│ ❱ 1531 │ │ ds_records = s │
│ project=self.manager.n │
│ 1532 │ │ if not ds_reco │
│ 1533 │ │ │ return [] │
│ 1534 │ │ if filter_zone │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/reposito │
│ ry.py:1492 in _lexis_ds_api │
│ │
│ 1489 │ @property │
│ 1490 │ def _lexis_ds_api( │
│ 1491 │ │ if self.__lexi │
│ ❱ 1492 │ │ │ self.__lex │
│ 1493 │ │ return self.__ │
│ 1494 │ │
│ 1495 │ def _init_irods(se │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/owilix/core/manager. │
│ py:559 in session │
│ │
│ 556 │ │ │ │ │ _re │
│ 557 │ │ │ │ │ try │
│ 558 │ │ │ │ │ │ │
│ ❱ 559 │ │ │ │ │ │ │
│ 560 │ │ │ │ │ │ │
│ 561 │ │ │ │ │ │ │
│ 562 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/py4lexis/core/sessio │
│ n.py:172 in __init__ │
│ │
│ 169 │ │ if login_method │
│ 170 │ │ │ offline_acc │
│ 171 │ │ │
│ ❱ 172 │ │ self.__uc = kck │
│ 173 │ │ │ │ │ │ │
│ 174 │ │ │ │ │ │ │
│ 175 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/py4lexis/core/kck_se │
│ ssion.py:62 in __init__ │
│ │
│ 59 │ │ │
│ 60 │ │ self._offline_a │
│ 61 │ │ │
│ ❱ 62 │ │ self._oid = Key │
│ 63 │ │ │ │ │ │ │
│ 64 │ │ │ │ │ │ │
│ 65 │ │ │ │ │ │ │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/keycloak/keycloak_op │
│ enid.py:127 in __init__ │
│ │
│ 124 │ │ self.client_se │
│ 125 │ │ self.realm_nam │
│ 126 │ │ headers = cust │
│ ❱ 127 │ │ self.connectio │
│ 128 │ │ │ base_url=s │
│ 129 │ │ │ headers=he │
│ 130 │ │ │ timeout=ti │
│ │
│ /Users/mgrani/bin/miniforge3/ │
│ envs/owi/lib/python3.11/site- │
│ packages/keycloak/connection. │
│ py:105 in __init__ │
│ │
│ 102 │ │ if proxies: │
│ 103 │ │ │ self._s.pro │
│ 104 │ │ │
│ ❱ 105 │ │ self.async_s = │
│ 106 │ │ self.async_s.au │
│ 107 │ │ self.async_s.tr │
│ 108 │
╰───────────────────────────────╯
TypeError: AsyncClient.__init__()
got an unexpected keyword
argument 'proxies'
No data available to display.
Listing available parquet files#
Per default, owilix puts the data under ~/.owi/public/<collectionName>, where <collectionName> is the name of the pulled collection (e.g. legal).
The directory has a sub-directory per dataset as well as a .json file per dataset containing metadata about the dataset
Reading and Printing Dataset Metadata#
In a first step, we simply read the metadata and print it as a pandas DataFrame.
import os
import json
import pandas as pd
# Path to the directory that holds the per-dataset JSON metadata files
dir_path = os.path.expanduser('~/.owi/public/')

# One dictionary of scalar metadata fields per JSON file
data = []

# os.listdir raises FileNotFoundError when nothing has been pulled yet, so check
# first and continue with an empty listing instead of aborting the cell.
if os.path.isdir(dir_path):
    # Sorted so that the row order does not depend on the file system
    filenames = sorted(os.listdir(dir_path))
else:
    print(f"Directory {dir_path} does not exist - pull a collection with owilix first.")
    filenames = []

for filename in filenames:
    # Only JSON files carry dataset metadata
    if not filename.endswith(".json"):
        continue
    # Explicit encoding so the result does not depend on the platform default
    with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    # Keep scalar values only; nested lists and dicts do not fit a flat table
    data.append({
        key: value
        for key, value in json_data.items()
        if not isinstance(value, (list, dict))
    })

# Create a pandas DataFrame from the JSON data
df = pd.DataFrame(data)

# Display the DataFrame. reindex (instead of df[[...]]) still works when a
# column is absent, e.g. because no metadata file was found at all.
display_columns = ["title", "collectionName", "startDate"]
print(df.reindex(columns=display_columns))
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[8], line 11
9 data = []
10 # Iterate over all files in the directory
---> 11 for filename in os.listdir(dir_path):
12 # Check if the file is a JSON file
13 if filename.endswith(".json"):
14 # Open and read the JSON file
15 with open(os.path.join(dir_path, filename), 'r') as file:
16 # Load the JSON data into a Python object
FileNotFoundError: [Errno 2] No such file or directory: '/Users/mgrani/.owi/public/'
Reading Parquet files and estimating statistics#
The main content is stored in parquet files located in the sub-directories of the datasets. To access those files, we first write a function that collects all parquet files whose path matches a certain pattern. In our case we aim for files contained in language=eng sub-folders, as these are pages identified as English.
import os
def collect_parquet_files(directory, pattern):
    """
    Collects all parquet files whose containing directory path matches a pattern.

    The directory tree below `directory` is walked recursively. The pattern is a
    plain substring test against the path of the directory that contains the
    file. It is neither a glob nor a regular expression, so wildcards such as
    '*' are compared literally.

    Args:
        directory (str): The directory to start searching from.
        pattern (str): Text that must occur in the path of the directory
            containing the file. For example, 'language=eng'.

    Returns:
        list: Paths of all files ending in '.parquet' below a matching
            directory, in the order os.walk visits them.
    """
    parquet_files = []
    for root, _dirs, files in os.walk(directory):
        # The pattern is tested against the directory only, so one check per
        # directory is enough instead of one per file.
        if pattern not in root:
            continue
        parquet_files.extend(
            os.path.join(root, file) for file in files if file.endswith('.parquet')
        )
    return parquet_files
# Example usage: search the metadata directory defined earlier for parquet files
# that live in a 'language=eng' folder and report how many were found.
directory = dir_path
pattern = 'language=eng'
parquet_files = collect_parquet_files(directory, pattern)
print(f"found {len(parquet_files)} files")
found 1104 files
Accessing the data as pandas dataframe#
We now load the first file as dataframe and print its content:
pd.read_parquet(parquet_files[0])
| id | record_id | title | description | keywords | author | main_content | json-ld | microdata | opengraph | ... | ows_referer | ows_resource_type | ows_tags | outgoing_links | image_links | video_links | iframes | curlielabels | curlielabels_en | address | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0007ee8e62be26f1a37bb9ad8aaf70cab53e97a6ab2c75... | e07bf2c2-5608-446d-97e6-31b8f1a4f6e8 | None | None | None | None | <p>{{Géolocalisation/Projection équirectangula... | None | None | None | ... | None | article2warc.py | None | None | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 1 | 0008c736a14d2e9bfce27281d0c4e7d25e21441c5b1607... | 448bce6f-c823-49f7-ba1e-bdfdaf0927e5 | None | None | None | None | <p>{{ébauche|film américain}}</p>\n\n<p>{{Info... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/morgan... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 2 | 002dd91e8dcf0032ef82136cfe0bfcce432c1b4c8be768... | 59274f37-42ef-4eaa-8cdf-6d6ce9a01780 | None | None | None | None | <p>{{Ébauche|album}}</p>\n\n<p>{{Voir homonyme... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/away-f... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 3 | 003584f824bc60f0c63889841f75263832b84824eeb2d2... | f2d7cd89-9f24-4904-b18f-cf0e7b213c7e | None | None | None | None | <p>{{voir homonymes|Schlicke}}</p>\n\n<p>{{éba... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/catego... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 4 | 003869670137923b1c95fcd10a696248d655bbba4eebe0... | 27651db6-b335-482d-8a0e-702e10366d82 | None | None | None | None | <p>{{Boîte déroulante/début|titre=<a href="htt... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/châtea... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8589 | ffe65750b0185a95fd02bf7742e581d66bd72e562ec214... | afb2eaab-11fe-41f4-865f-1a51eb04eed2 | None | None | None | None | <p>{{voir homonymes|Translation}}</p>\n\n<p>{{... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/hip-ho... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 8590 | ffee85bcb3629b370d15d53bbf147168d2f513fffe8002... | fe6c38c1-80ba-48b8-9dee-ff90a9414aaf | None | None | None | None | <p>{{ébauche|localité ouzbèke}}</p>\n\n<p>{{In... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/ouzbek... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 8591 | fff6e4ff6705269931a12fee69db9bf19d9c4c3841f756... | 13886567-7b9a-4c75-baef-940c8d00ccff | None | None | None | None | <p>require('strict')</p>\n\n<p>local p = {}</p... | None | None | None | ... | None | article2warc.py | None | None | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 8592 | fff7611af333cf7fc139174adc826a577b065e9c702be0... | c1ac2982-8333-4a6f-8e04-3f0abfec2ea0 | None | None | None | None | <p>{{ {{{1|country showdata}}}</p>\n\n<p>| ali... | None | None | None | ... | None | article2warc.py | None | None | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
| 8593 | fff9bea1a46c10880bee7ef42676f3da1838369605ff69... | a6102a37-2ab1-42e0-856e-a02917d28e22 | None | None | None | None | <p>{{Ébauche|localité de l'Illinois}}</p>\n\n<... | None | None | None | ... | None | article2warc.py | None | [{'src': 'https://fr.wikipedia.org/wiki/townsh... | None | None | None | [World/Français/Références/Encyclopédies] | None | None |
8594 rows × 47 columns
Now we iterate over all files, load each one as a pandas DataFrame and count the number of .de URLs in the url field, i.e. we search the url field for the text .de/.
Note that this can take some time, so we limit ourselves to 10 parquet files.
import pandas as pd
import re
def count_de_urls(parquet_files):
    """
    Counts the number of .de URLs in the 'url' field of each parquet file.

    A URL is counted when it contains the literal text '.de/'. Rows without a
    URL are not counted.

    Args:
        parquet_files (list): A list of parquet file paths.

    Returns:
        dict: A dictionary with the file path as the key and the count of .de
            URLs (a plain int) as the value. Files that could not be processed
            are reported on stdout and left out of the dictionary.
    """
    de_url_counts = {}
    for file in parquet_files:
        try:
            # Only the 'url' column is inspected, so do not load the others
            df = pd.read_parquet(file, columns=['url'])
            # The dot is escaped so that it matches a literal '.' only; an
            # unescaped '.' would also count URLs such as '.../code/'.
            # na=False treats missing URLs as "no match".
            is_de_url = df['url'].str.contains(r'\.de/', regex=True, na=False)
            # Store the count in the dictionary
            de_url_counts[file] = int(is_de_url.sum())
        except Exception as e:
            # Best effort: report the problematic file and continue with the rest
            print(f"Error reading file {file}: {e}")
    return de_url_counts
# Example usage: collect the file list again and count .de URLs in the first
# 10 files only, then print the resulting {file path: count} dictionary.
parquet_files = collect_parquet_files(directory, pattern)
de_url_counts = count_de_urls(parquet_files[0:10])
print(de_url_counts)
{'/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=30/language=eng/metadata_0.parquet': 3, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/language=eng/metadata_0.parquet': 14, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=10/day=30/language=eng/metadata_0.parquet': 3, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=10/day=24/language=eng/metadata_0.parquet': 14, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=29/language=eng/metadata_0.parquet': 2, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=08/language=eng/metadata_0.parquet': 7, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=30/language=eng/metadata_0.parquet': 3, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=06/language=eng/metadata_0.parquet': 4, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=07/language=eng/metadata_0.parquet': 7, '/home/mgrani/.owi/public/legal/0bb8c994-394a-11ef-84f3-0242ac1d0009/year=2023/month=10/day=24/year=2023/month=12/day=05/language=eng/metadata_0.parquet': 2}
We can also plot the counts.
import pandas as pd
# install matplotlib if needed
import matplotlib.pyplot as plt
def _short_file_labels(paths):
    """Returns one label per path with the directory prefix shared by all paths removed."""
    paths = list(paths)
    if len(paths) < 2:
        # Nothing to compare against, so the file name alone is sufficient
        return [os.path.basename(path) for path in paths]
    try:
        shared = os.path.commonpath(paths)
    except ValueError:
        # No common prefix can be computed (e.g. absolute and relative paths
        # mixed); fall back to the unshortened paths.
        return paths
    return [os.path.relpath(path, shared) for path in paths]


def plot_de_url_counts(de_url_counts):
    """
    Converts the de_url_counts dictionary into a pandas DataFrame and plots it.

    One bar is drawn per file. Each bar is labelled with the part of the file
    path that differs between the files.

    Args:
        de_url_counts (dict): A dictionary with the file path as the key and the count of .de URLs as the value.
    """
    # Convert the dictionary into a pandas DataFrame
    df = pd.DataFrame(list(de_url_counts.items()), columns=['File', 'DE URL Count'])
    # Cutting every path to its last characters gives identical labels when the
    # files share the same name, so strip the common leading directories instead.
    df['File'] = _short_file_labels(df['File'])
    # Bars are placed by position: matplotlib draws equal category labels at the
    # same x position, which would stack all bars on top of each other.
    positions = list(range(len(df)))
    # Plot the DataFrame
    plt.figure(figsize=(10, 6))
    plt.bar(positions, df['DE URL Count'])
    plt.xlabel('File')
    plt.ylabel('DE URL Count')
    plt.title('DE URL Count per File')
    plt.xticks(positions, df['File'], rotation=90)  # Rotate the x-axis labels for better readability
    plt.tight_layout()  # Ensure the labels fit within the figure
    plt.show()
# Example usage: collect the file list again, count .de URLs in the first
# 100 files and plot one bar per file.
parquet_files = collect_parquet_files(directory, pattern)
de_url_counts = count_de_urls(parquet_files[0:100])
plot_de_url_counts(de_url_counts)
Filtering for content#
While it is not the most efficient way, we can filter content using a regular expression. For example, we can scan all pages (e.g. impressums) whose URL, title or text mentions Germany, collect them in a separate dataframe, and plot the result.
import pandas as pd
import re
def get_filtered_pages(parquet_files, filter_regex=r'.*Germany.*'):
    """
    Returns a DataFrame with the filtered pages non-aggregated.

    A page (row) is kept when filter_regex matches at least one of its 'url',
    'title' or 'plain_text' values. Missing values count as "no match".

    Args:
        parquet_files (list): A list of parquet file paths.
        filter_regex (str): Regular expression searched for in the three text
            fields. Defaults to a pattern matching any text that contains
            'Germany'.

    Returns:
        pandas.DataFrame: A DataFrame with the filtered pages non-aggregated.
            It is empty when no file was given or no file could be processed.
            Files that fail are reported on stdout and skipped.
    """
    filtered_pages = []
    for file in parquet_files:
        try:
            # Load the parquet file as a pandas DataFrame
            df = pd.read_parquet(file)
            # Combine the matches of all searched fields into one row mask;
            # na=False keeps the mask strictly boolean when a value is missing.
            keep = pd.Series(False, index=df.index)
            for column in ('url', 'title', 'plain_text'):
                keep = keep | df[column].str.contains(filter_regex, regex=True, na=False)
            # Add the filtered pages to the list
            filtered_pages.append(df[keep])
        except Exception as e:
            # Best effort: report the problematic file and continue with the rest
            print(f"Error reading file {file}: {e}")
    # pd.concat raises ValueError on an empty list, so return an empty frame
    # when there is nothing to concatenate.
    if not filtered_pages:
        return pd.DataFrame()
    # Concatenate the filtered pages
    return pd.concat(filtered_pages, ignore_index=True)
# Example usage: collect the file list again, filter the first 400 files with
# the default regular expression and print the combined result.
parquet_files = collect_parquet_files(directory, pattern)
filtered_pages_df = get_filtered_pages(parquet_files[0:400])
print(filtered_pages_df)
url domain_label \
0 https://mezger.eu/impressum_en.html None
1 https://www.corporate-office-contacts.com/kiewit/ None
2 https://chronic.news/big-pharma-loses-billions... None
3 https://germanyworks.com/helpdesk/contact/ None
4 https://www.flyuia.com/sa/en/contacts/corporate Regional
... ... ...
8421 https://beabongiasca.com/pages/shipping-repair... None
8422 https://caiacosmetics.co.uk/uk/info/terms-and-... None
8423 https://www.valmet.com/about-us/contact-us/aut... Business
8424 https://www.glocomms.de/contact-us/frankfurt None
8425 https://actlegal.com/hu/locations/hungary None
title \
0 Impressum - mezger.eu_EN
1 Kiewit Corporate Office [Contact: Phone Number...
2 Big Pharma Loses Billions with Each State That...
3 Contact - GERMANY WORKS.
4 Corporate clients – UIA (Saudi Arabia)
... ...
8421 \n \n Terms & Conditions\n \n \n ...
8422 Terms and Conditions
8423 Automation contact form
8424 Frankfurt
8425 Hungary - act legal
plain_text
0 • de\n • fr\n\nmezger.eu_EN\n\nSkip navigat...
1 Skip to content\nCorporate Office Contacts\n\n...
2 Sunday, May 28 2023\nLatest\n • Health Canada...
3 • Industries\n • Decarbonisation\n • Downl...
4 AGREE\n\nWE VALUE YOUR PRIVACY\n\nOur website ...
... ...
8421 Skip to content\nThis site has limited support...
8422 Current country\nUK UK GBP\n • Sweden Sweden\...
8423 Valmet - Forward\nLogin icon-login\nicon-close...
8424 Glocomms DE\n • Search for jobs\n • Germ...
8425 Bán & Partners\n\n • Főoldal\n • Hírek\n • ...
[8426 rows x 4 columns]
# Wrap the filtered pages in a DataFrame of their own
df = pd.DataFrame(filtered_pages_df)

# Tally how often each 'domain_label' occurs; pages without a label are
# counted as well (dropna=False).
domain_label_counts = df['domain_label'].value_counts(dropna=False)

# Draw the tallies as a bar chart, one bar per label
plt.figure(figsize=(8, 6))
domain_label_counts.plot.bar()
plt.title('Histogram of Domain Label Counts')
plt.xlabel('Domain Label')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()