import numpy as np
from IPython.core.debugger import set_trace
assemble
These import
s are only meant for testing
Directory where the zip files are stored
= pathlib.Path.cwd().parent / 'samples'
directory assert directory.exists()
directory
PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples')
Parameters
A (sample) file in that directory
= directory / 'PlataformasAgregadasSinMenores_202201_05-06.zip'
input_file assert input_file.exists()
input_file
PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_202201_05-06.zip')
It is parsed and turned into a column-multiindexed pd.DataFrame
= sproc.bundle.read_zip(input_file, concatenate=True)
flat_df = sproc.hier.flat_df_to_multiindexed_df(flat_df)
df 2) df.head(
id | summary | title | updated | ContractFolderStatus | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | |||||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | PlannedPeriod | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||
PartyName | PartyName | ... | StartDate | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Girona | Entitats municipals de Catalunya | L'objecte és la contractació del servei de bug... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 |
1 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021_2568; Órgano de Contrataci... | Servei comunicacions postals de l'Ajuntament d... | [2022-01-04 12:12:09.400000+00:00] | 2021_2568 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Collbató | Entitats municipals de Catalunya | Servei comunicacions postals de l'Ajuntament d... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-29 23:59:00+00:00 |
2 rows × 39 columns
The last few columns
-2:] df.columns[
MultiIndex([('ContractFolderStatus', 'TenderingProcess', ...),
('ContractFolderStatus', 'TenderingProcess', ...)],
)
Deleted contracts are extracted
= sproc.bundle.read_deleted_zip(input_file)
deleted_series 2) deleted_series.head(
zip file name id
PlataformasAgregadasSinMenores_202201_05-06.zip PlataformasAgregadasSinMenores_20220105_030012.atom https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/6724977 2022-01-04 00:12:01.376000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968315 2022-01-03 23:11:57.567000+00:00
Name: deleted_on, dtype: datetime64[ns, UTC]
Handling deleted entries
A function to merge together a pd.DataFrame
with the actual data and a pd.Series
like the one shown meant to flag some entries as deleted. Ultimately, a new a column, deleted_on
, is added to the former. It can either contain a date or pd.NaT
if the corresponding entry was not deleted.
merge_deleted
merge_deleted (data_df:pandas.core.frame.DataFrame, deleted_series:pandas.core.series.Series)
Adds a new column with the date on which an entry was deleted or pd.NaT
if it was not
Type | Details | |
---|---|---|
data_df | DataFrame | Input data |
deleted_series | Series | Deleted entries |
Returns | DataFrame | Input with new deleted_on column |
= merge_deleted(df, deleted_series)
merged_df 2) merged_df.head(
id | summary | title | updated | ContractFolderStatus | deleted_on | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | ... | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | ||||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | |||||||||||
PartyName | PartyName | ... | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | |||||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Girona | Entitats municipals de Catalunya | L'objecte és la contractació del servei de bug... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 | NaT |
1 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021_2568; Órgano de Contrataci... | Servei comunicacions postals de l'Ajuntament d... | [2022-01-04 12:12:09.400000+00:00] | 2021_2568 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Collbató | Entitats municipals de Catalunya | Servei comunicacions postals de l'Ajuntament d... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-29 23:59:00+00:00 | NaT |
2 rows × 40 columns
For column-wise flat pd.DataFrame
s
2) merge_deleted(flat_df, deleted_series).head(
id | summary | title | updated | ContractFolderStatus - ContractFolderID | ContractFolderStatus - ContractFolderStatusCode | ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID | ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name | ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name | ContractFolderStatus - ProcurementProject - Name | ... | ContractFolderStatus - LegalDocumentReference - ID | ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI | ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate | ContractFolderStatus - TechnicalDocumentReference - ID | ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI | ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name | ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate | ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime | ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod | deleted_on | |||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Girona | Entitats municipals de Catalunya | L'objecte és la contractació del servei de bug... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 | NaT |
1 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021_2568; Órgano de Contrataci... | Servei comunicacions postals de l'Ajuntament d... | [2022-01-04 12:12:09.400000+00:00] | 2021_2568 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Collbató | Entitats municipals de Catalunya | Servei comunicacions postals de l'Ajuntament d... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-29 23:59:00+00:00 | NaT |
2 rows × 40 columns
The last column is the new one added
-2:] merged_df.iloc[:,
ContractFolderStatus | deleted_on | |||
---|---|---|---|---|
TenderingProcess | ||||
TenderSubmissionDeadlinePeriod | ||||
zip | file name | entry | ||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | 2021-12-13 23:59:00+00:00 | NaT |
1 | 2021-11-29 23:59:00+00:00 | NaT | ||
2 | 2021-12-20 14:00:00+00:00 | NaT | ||
3 | 2021-10-22 14:00:00+00:00 | NaT | ||
4 | 2021-11-30 18:00:00+00:00 | NaT | ||
... | ... | ... | ... | |
PlataformasAgregadasSinMenores_20220106_030013.atom | 471 | 2022-01-19 15:00:00+00:00 | NaT | |
472 | 2021-05-07 15:00:00+00:00 | NaT | ||
473 | 2021-06-30 23:59:00+00:00 | NaT | ||
474 | 2021-12-13 13:00:00+00:00 | NaT | ||
475 | 2021-11-10 18:00:00+00:00 | NaT |
951 rows × 2 columns
The new merged pd.DataFrame
has the same number of rows as the original one
assert len(df) == len(merged_df)
How many entries have been deleted?
'deleted_on'].notna().sum() merged_df[
0
Saving to parquet
In order to save a pd.DataFrame
in a parquet file, some conditions must be met:
if some column contains a list, then every element in that columns must also be a list (of a single element if necessary)
the elements of the lists in any column must be of the same type
all of the lists across a given column must have elements of the same type
parquet_amenable
parquet_amenable (df:pandas.core.frame.DataFrame, inplace:bool=False)
**Tunes* a pd.DataFrame
so that it can saved in a parquet file*
Type | Default | Details | |
---|---|---|---|
df | DataFrame | Input DataFrame to be “tuned” for parquet |
|
inplace | bool | False | If True , the input is modified |
Returns | DataFrame | Parquet-ready DataFrame |
= parquet_amenable(df)
parquet_df 2) parquet_df.head(
id | summary | title | updated | ContractFolderStatus | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | |||||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | PlannedPeriod | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||
PartyName | PartyName | ... | StartDate | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Girona | Entitats municipals de Catalunya | L'objecte és la contractació del servei de bug... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 |
1 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021_2568; Órgano de Contrataci... | Servei comunicacions postals de l'Ajuntament d... | [2022-01-04 12:12:09.400000+00:00] | 2021_2568 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Collbató | Entitats municipals de Catalunya | Servei comunicacions postals de l'Ajuntament d... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-29 23:59:00+00:00 |
2 rows × 39 columns
Concatenating dataframes
Concatenating pd.DataFrame
s with the same number of levels in the columns is fine
2], df.iloc[2:4]), axis=0).head(2) pd.concat((df.iloc[:
id | summary | title | updated | ContractFolderStatus | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | |||||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | PlannedPeriod | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||
PartyName | PartyName | ... | StartDate | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Girona | Entitats municipals de Catalunya | L'objecte és la contractació del servei de bug... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 |
1 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021_2568; Órgano de Contrataci... | Servei comunicacions postals de l'Ajuntament d... | [2022-01-04 12:12:09.400000+00:00] | 2021_2568 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Ajuntament de Collbató | Entitats municipals de Catalunya | Servei comunicacions postals de l'Ajuntament d... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-29 23:59:00+00:00 |
2 rows × 39 columns
Let us build a dummy pd.DataFrame
with multiindexes for both rows and columns
= pd.DataFrame(
dummy_df 2, 4),
np.random.rand(=pd.MultiIndex.from_product([['a', 'b'], range(1, 3)]),
columns=pd.MultiIndex.from_tuples([('file.zip', 'foo.atom', 1), ('file.zip', 'fee.atom', 2)], names=['zip', 'file name', 'entry']))
index dummy_df
a | b | |||||
---|---|---|---|---|---|---|
1 | 2 | 1 | 2 | |||
zip | file name | entry | ||||
file.zip | foo.atom | 1 | 0.640319 | 0.345485 | 0.318882 | 0.357589 |
fee.atom | 2 | 0.054101 | 0.484075 | 0.908403 | 0.283865 |
This new dummy pd.DataFrame
and the one above cannot be stacked together as they are because they have different number of levels in the columns
try:
=0)
pd.concat((df, dummy_df), axisexcept ValueError:
print(f'{df.columns.nlevels=}, {dummy_df.columns.nlevels=}')
df.columns.nlevels=6, dummy_df.columns.nlevels=2
The resulting pd.DataFrame
must have the larger number of levels
= max(df.columns.nlevels, dummy_df.columns.nlevels)
res_n_levels res_n_levels
6
Then we need to complement the names of the columns for the shallower pd.DataFrame
= []
new_names
for c in dummy_df.columns:
+ ('',) * (res_n_levels - dummy_df.columns.nlevels))
new_names.append(c
print(new_names)
[('a', 1, '', '', '', ''), ('a', 2, '', '', '', ''), ('b', 1, '', '', '', ''), ('b', 2, '', '', '', '')]
A new padded pd.DataFrame
is built out of the dummy
= dummy_df.copy()
padded_dummy_df = pd.MultiIndex.from_tuples(new_names)
padded_dummy_df.columns padded_dummy_df.columns.nlevels
6
Now it’s fine to concatenate both pd.DataFrame
s
=0).tail(3) pd.concat((df, padded_dummy_df), axis
id | summary | title | updated | ContractFolderStatus | a | b | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | ... | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | 1 | 2 | 1 | 2 | ||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||||||
PartyName | PartyName | ... | ExternalReference | ParentLocatedParty | EndDate | EndTime | |||||||||||||||||
Name | Name | ... | URI | PartyName | |||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220106_030013.atom | 475 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: CONTR/2021/000000137; Órgano de... | Servei de redacció del Projecte Executiu d'obr... | [2022-01-04 12:12:09.536000+00:00] | CONTR/2021/000000137 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Institut d'Assistència Sanitària (IAS) | Departaments i Sector Públic de la Generalitat | Servei de redacció del Projecte Executiu d'obr... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-10 18:00:00+00:00 | NaN | NaN | NaN | NaN |
file.zip | foo.atom | 1 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | NaT | 0.640319 | 0.345485 | 0.318882 | 0.357589 |
fee.atom | 2 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | NaT | 0.054101 | 0.484075 | 0.908403 | 0.283865 |
3 rows × 43 columns
The above steps are summarized in a new function.
stack
stack (top_df:pandas.core.frame.DataFrame, bottom_df:pandas.core.frame.DataFrame)
Stacks one pd.DataFrame
on top of another
Type | Details | |
---|---|---|
top_df | DataFrame | Top |
bottom_df | DataFrame | Bottom |
Returns | DataFrame | Stacked pd.DataFrames |
Above df
and the dummy one are stacked together
3) stack(df, dummy_df).tail(
id | summary | title | updated | ContractFolderStatus | a | b | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | ... | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | 1 | 2 | 1 | 2 | ||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||||||
PartyName | PartyName | ... | ExternalReference | ParentLocatedParty | EndDate | EndTime | |||||||||||||||||
Name | Name | ... | URI | PartyName | |||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220106_030013.atom | 475 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: CONTR/2021/000000137; Órgano de... | Servei de redacció del Projecte Executiu d'obr... | [2022-01-04 12:12:09.536000+00:00] | CONTR/2021/000000137 | [ADJ] | https://contractaciopublica.gencat.cat/ecofin_... | Institut d'Assistència Sanitària (IAS) | Departaments i Sector Públic de la Generalitat | Servei de redacció del Projecte Executiu d'obr... | ... | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-11-10 18:00:00+00:00 | NaN | NaN | NaN | NaN |
file.zip | foo.atom | 1 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | NaT | 0.640319 | 0.345485 | 0.318882 | 0.357589 |
fee.atom | 2 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | <NA> | <NA> | <NA> | <NA> | ... | <NA> | <NA> | <NA> | <NA> | <NA> | NaT | 0.054101 | 0.484075 | 0.908403 | 0.283865 |
3 rows × 43 columns
…in a different order
3) stack(dummy_df, df).head(
a | b | id | summary | title | updated | ContractFolderStatus | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 2 | ContractFolderID | ContractFolderStatusCode | ... | ProcurementProject | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | |||||||||||
... | PlannedPeriod | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||||||
... | StartDate | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||||||
... | URI | URI | PartyName | ||||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
file.zip | foo.atom | 1 | 0.640319 | 0.345485 | 0.318882 | 0.357589 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | NaT |
fee.atom | 2 | 0.054101 | 0.484075 | 0.908403 | 0.283865 | <NA> | <NA> | <NA> | NaN | <NA> | NaN | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | NaT | |
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 0 | NaN | NaN | NaN | NaN | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021039438; Órgano de Contratac... | L'objecte és la contractació del servei de bug... | [2022-01-04 12:12:09.464000+00:00] | 2021039438 | [RES] | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 2021-12-13 23:59:00+00:00 |
3 rows × 43 columns
Processing a zip
A convenience function to streamline a common procedure to process data in zip files
distilled_data_from_zip
distilled_data_from_zip (zip_file:pathlib.Path|str)
Reads a zip file and extracts the data and deleted entries
Type | Details | |
---|---|---|
zip_file | pathlib.Path | str | Input file |
Returns | tuple | Data, deleted entries |
= distilled_data_from_zip(input_file) distilled_df, distilled_series
The bulk of the data
2) distilled_df.head(
id | summary | title | ContractFolderStatus | updated | ContractFolderStatus | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | LocatedContractingParty | ProcurementProject | ... | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | ContractFolderStatusCode | ||||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | TypeCode | BudgetAmount | ... | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||||
PartyName | PartyName | EstimatedOverallContractAmount | ... | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||||
... | Name | ||||||||||||||||||||||
zip | file name | entry | |||||||||||||||||||||
PlataformasAgregadasSinMenores_202201_05-06.zip | PlataformasAgregadasSinMenores_20220105_030012.atom | 474 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 15/2021; Ó“rgano de Contratació... | “Reforestación de 8,8 HA en el paraje “MONTAÑO... | 15/2021 | <NA> | Junta de Gobierno Local | Ayuntamiento de Zierbena | “Reforestación de 8,8 HA en el paraje “MONTAÑO... | 3.0 | 51345.73 | ... | https://www.contratacion.euskadi.eus/ac70cPubl... | <NA> | Obra repoblación Montaño - MUP 66 Publicado.pdf | https://www.contratacion.euskadi.eus/ac70cPubl... | Ayuntamiento de Zierbena | <NA> | <NA> | 2021-12-07 13:00:00+00:00 | [2022-01-03 12:56:20.849000+00:00] | [EV] |
473 | https://contrataciondelestado.es/sindicacion/P... | Id licitación: 2021/CO_ASER/0120; Ó“rgano de C... | Formación a personas emprendedoras | 2021/CO_ASER/0120 | <NA> | Concejala-Delegada del Departamento de Promoci... | Ayuntamiento de Vitoria-Gasteiz | Formación a personas emprendedoras | 2.0 | 34000.00 | ... | https://www.contratacion.euskadi.eus/ac70cPubl... | <NA> | ASER0120_Pliego_Prescripciones_Tecnicas_Partic... | https://www.contratacion.euskadi.eus/ac70cPubl... | Ayuntamiento de Vitoria-Gasteiz | <NA> | <NA> | 2021-11-26 13:00:00+00:00 | [2022-01-03 12:56:20.897000+00:00] | [EV] |
2 rows × 39 columns
The Series
with the deleted data
2) distilled_series.head(
zip file name id
PlataformasAgregadasSinMenores_202201_05-06.zip PlataformasAgregadasSinMenores_20220105_030012.atom https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/6724977 2022-01-04 00:12:01.376000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968315 2022-01-03 23:11:57.567000+00:00
Name: deleted_on, dtype: datetime64[ns, UTC]
input_file
PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_202201_05-06.zip')
# another_input_file = pathlib.Path.cwd().parent / 'data' / 'contratosMenoresPerfilesContratantes_2018.zip'
# assert another_input_file.exists()
# print(another_input_file)
# another_distilled_df, another_distilled_series = distilled_data_from_zip(another_input_file)
# another_distilled_df.head()
# merge_deleted(another_distilled_df, another_distilled_series)
# res = parquet_amenable(res)
Completeness
It computes the filling ratio of every identified administration for every column. Non-identified administration will be gathered together in a NaN
domain.
sparsity
sparsity (df:pandas.core.frame.DataFrame, tidy_up:bool=False, do_not_modify_input:bool=True)
Ratio of completeness for every (identified) administration
Type | Default | Details | |
---|---|---|---|
df | DataFrame | Input | |
tidy_up | bool | False | If True columns are ordered |
do_not_modify_input | bool | True | If True no domain column is added to the input |
Returns | DataFrame | Output |
sparsity(df)
id | summary | title | updated | ContractFolderStatus | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderID | ContractFolderStatusCode | LocatedContractingParty | ProcurementProject | LegalDocumentReference | ProcurementProject | TechnicalDocumentReference | LocatedContractingParty | TenderingProcess | |||||||||||||
BuyerProfileURIID | Party | ParentLocatedParty | Name | ... | PlannedPeriod | ID | Attachment | PlannedPeriod | ID | Attachment | ParentLocatedParty | ParticipationRequestReceptionPeriod | TenderSubmissionDeadlinePeriod | ||||||||
PartyName | PartyName | ... | StartDate | ExternalReference | EndDate | ExternalReference | ParentLocatedParty | EndDate | EndTime | ||||||||||||
Name | Name | ... | URI | URI | PartyName | ||||||||||||||||
... | Name | ||||||||||||||||||||
domain | |||||||||||||||||||||
contractaciopublica.gencat.cat | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.000000 | 1.0 | ... | 0.121622 | 0.353604 | 0.353604 | 0.121622 | 0.317568 | 0.317568 | 0.000000 | 0.000000 | 0.000000 | 0.934685 |
www.bilbao.eus | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.000000 | 1.0 | ... | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.666667 | 0.666667 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
www.contratacion.euskadi.eus | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.000000 | 1.0 | ... | 0.062500 | 1.000000 | 1.000000 | 0.062500 | 0.906250 | 0.906250 | 1.000000 | 0.000000 | 0.000000 | 0.979167 |
www.contratosdegalicia.gal | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.000000 | 1.0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
www.larioja.org | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.000000 | 1.0 | ... | 0.750000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 0.000000 | 1.000000 |
www.madrid.org | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.747826 | 1.0 | ... | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.947826 | 0.947826 | 0.400000 | 0.000000 | 0.000000 | 0.252174 |
NaN | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.000000 | 1.0 | ... | 0.034483 | 0.000000 | 0.000000 | 0.034483 | 0.000000 | 0.000000 | 0.006897 | 0.055172 | 0.055172 | 0.910345 |
7 rows × 39 columns
The tidied-up version
=True) sparsity(df, tidy_up
domain | contractaciopublica.gencat.cat | www.bilbao.eus | www.contratacion.euskadi.eus | www.contratosdegalicia.gal | www.larioja.org | www.madrid.org | NaN | |||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
ContractFolderStatus | ContractFolderID | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | ||||
ContractFolderStatusCode | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
LegalDocumentReference | Attachment | ExternalReference | URI | 0.353604 | 1.000000 | 1.000000 | 0.000000 | 0.000 | 1.000000 | 0.000000 | ||
ID | 0.353604 | 1.000000 | 1.000000 | 0.000000 | 0.000 | 1.000000 | 0.000000 | |||||
LocatedContractingParty | BuyerProfileURIID | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000 | 1.000000 | 0.000000 | ||||
ParentLocatedParty | ParentLocatedParty | PartyName | Name | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.500 | 0.400000 | 0.006897 | ||
PartyName | Name | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000 | 0.747826 | 1.000000 | ||||
Party | PartyIdentification | ID | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000 | 0.000000 | 0.000000 | |||
PartyName | Name | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | ||||
ProcurementProject | BudgetAmount | EstimatedOverallContractAmount | 1.000000 | 1.000000 | 0.979167 | 0.000000 | 1.000 | 1.000000 | 1.000000 | |||
TaxExclusiveAmount | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
Name | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
PlannedPeriod | DurationMeasure | 0.878378 | 1.000000 | 0.937500 | 0.590909 | 1.000 | 1.000000 | 0.862069 | ||||
EndDate | 0.121622 | 0.000000 | 0.062500 | 0.000000 | 0.000 | 0.000000 | 0.034483 | |||||
StartDate | 0.121622 | 0.000000 | 0.062500 | 0.000000 | 0.750 | 0.000000 | 0.034483 | |||||
RealizedLocation | CountrySubentityCode | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | ||||
RequiredCommodityClassification | ItemClassificationCode | 0.997748 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | ||||
TypeCode | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
TechnicalDocumentReference | Attachment | ExternalReference | URI | 0.317568 | 0.666667 | 0.906250 | 0.000000 | 0.000 | 0.947826 | 0.000000 | ||
ID | 0.317568 | 0.666667 | 0.906250 | 0.000000 | 0.000 | 0.947826 | 0.000000 | |||||
TenderResult | AwardedTenderedProject | LegalMonetaryTotal | TaxExclusiveAmount | 0.621622 | 0.000000 | 0.385417 | 0.590909 | 0.750 | 0.495652 | 0.524138 | ||
ProcurementProjectLotID | 0.103604 | 0.000000 | 0.000000 | 0.272727 | 0.125 | 0.060870 | 0.213793 | |||||
ReceivedTenderQuantity | 0.644144 | 0.000000 | 0.406250 | 0.590909 | 0.750 | 0.495652 | 0.537931 | |||||
ResultCode | 0.644144 | 0.000000 | 0.406250 | 0.590909 | 0.750 | 0.495652 | 0.537931 | |||||
WinningParty | PartyIdentification | ID | 0.621622 | 0.000000 | 0.385417 | 0.545455 | 0.750 | 0.495652 | 0.524138 | |||
PartyName | Name | 0.621622 | 0.000000 | 0.385417 | 0.545455 | 0.750 | 0.495652 | 0.524138 | ||||
TenderingProcess | ParticipationRequestReceptionPeriod | EndDate | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000 | 0.000000 | 0.055172 | |||
EndTime | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000 | 0.000000 | 0.055172 | |||||
ProcedureCode | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
TenderSubmissionDeadlinePeriod | 0.934685 | 1.000000 | 0.979167 | 1.000000 | 1.000 | 0.252174 | 0.910345 | |||||
EndDate | 0.934685 | 1.000000 | 0.979167 | 1.000000 | 1.000 | 0.252174 | 0.910345 | |||||
EndTime | 0.934685 | 1.000000 | 0.979167 | 1.000000 | 1.000 | 0.252174 | 0.910345 | |||||
ValidNoticeInfo | AdditionalPublicationStatus | AdditionalPublicationDocumentReference | IssueDate | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | ||
PublicationMediaName | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
NoticeTypeCode | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
id | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
summary | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
title | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 | |||||
updated | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 |
If we set to False
the do_not_modify_input
parameter, then a new domain
column is added to the input pd.DataFrame
. By default, it’s not there.
'domain' in df.columns
False
=False)
sparsity(df, do_not_modify_input'domain' in df.columns
True