How to add zip files into Pandas Dataframe

4 minute read

Hello everyone, today I want to show an interesting trick for storing a zip file in a column of a pandas DataFrame.

Sometimes you are creating an unstructured database in which you need to include photos, videos, Word documents, Excel files or simple binary files.

There are plenty of methods to do this. The method that I will pursue is to zip all the files that you want to store and then put the archive into a pandas DataFrame.

Getting files of a library

Let us assume that we want to insert some Python libraries into a zip file. For example, I want to get the findspark package.

# Import the os module
import os
# Path
home = os.getcwd()
# Print the current working directory
print("Current working directory: {0}".format(home))
Current working directory: /home/wsuser/work
!mkdir folder
# Join various path components
target=os.path.join(home, "folder")
print(target)
# Change the current working directory
os.chdir(target)
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
/home/wsuser/work/folder
Current working directory: /home/wsuser/work/folder

Here we put the files that we want to store in the DataFrame.

!pip download findspark 
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Saved ./findspark-2.0.1-py2.py3-none-any.whl
Successfully downloaded findspark

Printing the files to be inserted into the dataframe

!dir
findspark-2.0.1-py2.py3-none-any.whl
# Change the current working directory
os.chdir(home)

We want to zip the folder by using Python.

Zip files python

Here we want to create a program that creates a zip file of all the files contained in a certain folder.

#importing the glob and os modules
import glob
import os
#to get the current working directory
current_dir=os.getcwd()
# Join various path components
folder_to_zip=os.path.join(current_dir, "folder")
print(folder_to_zip)
/home/wsuser/work/folder
def get_list_of_files(directory):
    """Return every path matched by the glob pattern ``<directory>/*``.

    The directory being searched is printed before the lookup runs.
    """
    # Wildcard pattern covering the direct children of the directory
    pattern = directory + "/*"
    print("Looking files in the path:", directory)
    return glob.glob(pattern)
from zipfile import ZipFile
from io import BytesIO
def create_zip(folder_to_zip):
    """
    Build an in-memory zip archive from the files directly inside a folder.

    Every regular file returned by ``get_list_of_files(folder_to_zip)`` is
    stored in the archive under its base name. Entries that are not regular
    files (for example sub-directories) are skipped.

    folder_to_zip: path of the folder whose files are archived
    returns: zip archive as a BytesIO object; the stream is not rewound
        here, so read it with getvalue()/getbuffer() or call seek(0) first
    """
    import shutil  # local import: only needed for the streaming copy below

    files_to_zip = get_list_of_files(folder_to_zip)
    archive = BytesIO()
    with ZipFile(archive, 'w') as zip_archive:
        for file in files_to_zip:
            # A sub-directory cannot be opened with open(file, 'rb'),
            # so only regular files are archived.
            if not os.path.isfile(file):
                continue
            print(file)
            # Use the base name instead of slicing by len(folder_to_zip)+1:
            # the slice drops the first character of the name when the
            # folder path ends with a separator.
            name = os.path.basename(file)
            print(name)
            # Create one member per file and stream the content in chunks
            # so large files are never fully loaded into memory.
            with zip_archive.open(name, 'w') as member:
                with open(file, 'rb') as file_data:
                    shutil.copyfileobj(file_data, member)
    return archive
archive = create_zip(folder_to_zip)
Looking files in the path: /home/wsuser/work/folder
/home/wsuser/work/folder/findspark-2.0.1-py2.py3-none-any.whl
findspark-2.0.1-py2.py3-none-any.whl
# Flush archive stream to a file on disk
with open('data.zip', 'wb') as f:
    f.write(archive.getbuffer())
type(archive)
_io.BytesIO
archive.close()
if os.name == 'nt':
    print('I am Windows')
    !dir *.zip
else:
    print('I am on Unix')
    !ls *.zip -ltr
I am on Unix
-rw-rw---- 1 wsuser wscommon 4616 Feb 26 19:40 data.zip

We now have the zip file that we want to insert into the DataFrame.

from zipfile import ZipFile
with ZipFile('data.zip') as zip_archive:
    for item in zip_archive.filelist:
        print(item)
    print(f'\nThere are {len(zip_archive.filelist)} ZipInfo objects present in archive')
<ZipInfo filename='findspark-2.0.1-py2.py3-none-any.whl' filemode='?rw-------' file_size=4446>

There are 1 ZipInfo objects present in archive

Read all files in .zip archive in python

archive = ZipFile('data.zip', 'r')
files = archive.namelist()
files
['findspark-2.0.1-py2.py3-none-any.whl']
type(archive)
zipfile.ZipFile
import zipfile
# Open the archive in a context manager so its file handle is closed
# as soon as every member has been read.
with zipfile.ZipFile("data.zip", "r") as z:
    for filename in z.namelist():
        print('File:', filename)
        # read() returns the content of the named member as a bytes object
        byt = z.read(filename)
        print(type(byt))
        print('has', len(byt), 'bytes')
File: findspark-2.0.1-py2.py3-none-any.whl
<class 'bytes'>
has 4446 bytes
print(byt[:10])
type(byt)
b'PK\x03\x04\x14\x00\x00\x00\x08\x00'
bytes
import base64
base64_encoded_data = base64.b64encode(byt)
print(base64_encoded_data[:10])
type(base64_encoded_data)
b'UEsDBBQAAA'

bytes
base64_message = base64_encoded_data.decode('utf-8')
print(base64_message[:10])
type(base64_message)
UEsDBBQAAA

str

In the snippet below, line 3: We write the encoded word as a bytes object.

Line 5: We use the decode() method with the UTF-8 encoding scheme to transform the encoded values into a string object.

Line 7: We print the decoded value.

# Encoded byte values
# for the word EDPRESSO
encoded_word = b'\x45\x44\x50\x52\x45\x53\x53\x4f'
# Decode with UTF8; named so that the built-in ``bytes`` type is not shadowed
decoded_word = encoded_word.decode('utf8')
# Show results
print("Decoded bytes: " + decoded_word)
Decoded bytes: EDPRESSO

There are different ways to encode the zip file.

Converting zip files to bytes and encode

import base64
with open("data.zip", 'rb') as f:
    data = f.read()
    print(type(data))
    #print(data)
    encoded = base64.b64encode(data)
<class 'bytes'>

Let us print the first 10 bytes of our zip data

print('Undecoded zip data: ',data[:10])
Undecoded zip data:  b'PK\x03\x04\x14\x00\x00\x00\x00\x00'
print('Encoded zip data: ',encoded[:10])
Encoded zip data:  b'UEsDBBQAAA'
type(data)
bytes
type(encoded)
bytes

Converting bytes to string - from encoding

# Program for converting bytes to string using str()
data =encoded
 
# display input
print('\nInput:')
print(data[:10])
print(type(data))
 
# converting
output = str(data, 'UTF-8')
 
# display output
print('\nOutput:')
print(output[:10])
print(type(output))
Input:
b'UEsDBBQAAA'
<class 'bytes'>

Output:
UEsDBBQAAA
<class 'str'>
import pandas as pd
dic = {'encoded' : [encoded]}
df = pd.DataFrame(data=dic)
x = df['encoded'].str.decode("utf-8")
#df['decoded']=x
print(x)
0    UEsDBBQAAAAAAAAAIQAZwvYbXhEAAF4RAAAkAAAAZmluZH...
Name: encoded, dtype: object
df.dtypes
encoded    object
dtype: object
df.head()
encoded
0 b'UEsDBBQAAAAAAAAAIQAZwvYbXhEAAF4RAAAkAAAAZmlu...
df.memory_usage(deep=True)
Index       128
encoded    6197
dtype: int64
df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   encoded  1 non-null      object
dtypes: object(1)
memory usage: 6.2 KB
df.to_csv('df_zip.csv')  

Reading zipped dataframe

df_new=pd.read_csv('df_zip.csv')  
df_new.dtypes
Unnamed: 0     int64
encoded       object
dtype: object
df_new.memory_usage(deep=True)
Index          128
Unnamed: 0       8
encoded       6216
dtype: int64
df_new.head()
Unnamed: 0 encoded
0 0 b'UEsDBBQAAAAAAAAAIQAZwvYbXhEAAF4RAAAkAAAAZmlu...

Unzip zipped dataframe

Encoded bytes to zip file

# String of encoded codes
string_bytes=df_new['encoded'][0]
type(string_bytes)
str
size=len(string_bytes)
# The CSV holds the textual form "b'...'" of the bytes value: drop the
# leading b' and also the closing quote, so only base64 characters remain.
string=string_bytes[2:size-1]
string[:10]
'UEsDBBQAAA'

We compare with the original

string_bytes_original=df['encoded'].str.decode("utf-8")[0]
type(str(string_bytes_original))
str
string_bytes_original[:10]
'UEsDBBQAAA'

We decode our encoded string

byte_decoded=base64.b64decode(string)
bin_data=byte_decoded #Whatever binary data you have store in a variable
binary_file_path = 'new_file.zip' #Name for new zip file you want to regenerate
with open(binary_file_path, 'wb') as f:
    f.write(bin_data)

Checking our new_file.zip

from zipfile import ZipFile
with ZipFile('new_file.zip') as zip_archive:
  for item in zip_archive.filelist:
    print(item)
  print(f'\nThere are {len(zip_archive.filelist)} ZipInfo objects present in archive')
<ZipInfo filename='findspark-2.0.1-py2.py3-none-any.whl' filemode='?rw-------' file_size=4446>

There are 1 ZipInfo objects present in archive
if os.name == 'nt':
    print('I am Windows')
    !dir
else:
    print('I am on Unix')
    !ls -ltrh
I am on Unix
total 28K
drwxrwx--- 2 wsuser wscommon 4.0K Feb 26 19:40 folder
-rw-rw---- 1 wsuser wscommon 4.6K Feb 26 19:40 data.zip
-rw-rw---- 1 wsuser wscommon 6.1K Feb 26 19:40 df_zip.csv
-rw-rw---- 1 wsuser wscommon 4.6K Feb 26 19:40 new_file.zip

You can download this notebook here.

Congratulations! We have practiced how to add zip files to a DataFrame as a field.

Posted:

Leave a comment