Given a path of the form p = "/path/to/file.txt", how do we compute the output path "/output/path/file.json"?

In [3]:
import os.path

# Input path whose file name is reused for the output.
p = "/path/to/file.txt"

# Directory part and final component of the path, taken separately.
parent = os.path.dirname(p)
name = os.path.basename(p)
name

'file.txt'

In [14]:
# Drop the old extension from the file name and append the new one.
new_name = os.path.splitext(name)[0] + '.json'
# An assignment alone displays nothing; echo the value so the cell shows it.
new_name

'file.json'

In [16]:
# Join the output directory parts with the new file name. 'output' has no
# leading separator, so the resulting path is relative.
os.path.join('output', 'path', new_name)

'output/path/file.json'

In [13]:
from pathlib import Path

# pathlib variant: swap the suffix, keep only the file name, and append it to
# the output directory.
Path('output/path/').joinpath(Path(p).with_suffix('.json').name)

PosixPath('output/path/file.json')

### Threading

In [22]:
import threading

def printer(num):
    """Print ``num``; used as the target function of each thread below."""
    print(num)

# Keep a handle on every thread so all of them can be joined. Without join()
# the cell may finish before its workers do, and their output can end up
# under a later cell.
threads = [threading.Thread(target=printer, args=(i,)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()

0
1
2
3
4


In [28]:
my_lock = threading.Lock()

def printer(num):
    """Print ``num`` while holding ``my_lock`` so only one thread prints at a time."""
    with my_lock:
        print(num)

# Start every worker first, then wait for all of them. Without join() the
# cell may finish before its threads do, and their output can end up under a
# later cell.
threads = [threading.Thread(target=printer, args=(i,)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()

0
1
2
3
4


In [31]:
import concurrent.futures

my_lock = threading.Lock()


def printer(num):
    """Print ``num`` under ``my_lock`` and hand back its square."""
    with my_lock:  # one writer to stdout at a time
        print(num)
    return num ** 2


# Leaving the ``with`` block waits for all submitted work to finish.
# ``results`` yields the return values in input order, regardless of the
# order in which the workers printed.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    results = pool.map(printer, range(10))

0
1
2
6
7
5
8
9
4
3


In [32]:
# Collect the mapped return values; they follow input order, not print order.
list(results)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [34]:
import concurrent.futures

my_lock = threading.Lock()


def printer(num1, num2):
    """Print both arguments under ``my_lock`` and return the first one."""
    with my_lock:  # one writer to stdout at a time
        print(num1, num2)
    return num1


# map() takes one iterable per parameter and walks them in lockstep, so the
# calls are printer(0, 4), printer(1, 3), ..., printer(4, 0).
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    results = pool.map(printer, range(5), reversed(range(5)))

0 4
1 3
2 2
3 1
4 0


In [35]:
# Each collected value is the first argument (num1) of the corresponding call.
list(results)

[0, 1, 2, 3, 4]

### Multiprocessing

In [None]:
# !!! will not run in the notebook !!!

# import multiprocessing

# def printer(num):
#     print(num)
    
# with multiprocessing.Pool() as pool:
#     pool.map(printer, range(5))

In [38]:
%%writefile test-multiprocessing.py
import multiprocessing

def square(num):
    """Return ``num`` multiplied by itself."""
    product = num * num
    return product
    
if __name__ == '__main__':
    # NOTE(review): clearing __spec__ looks like the usual workaround for
    # starting multiprocessing from a script launched via IPython's %run -- confirm.
    __spec__ = None
    # Five worker processes square the numbers 0..4; map() keeps input order.
    with multiprocessing.Pool(processes=5) as pool:
        result = pool.map(square, range(5))
        print(result)

Writing test-multiprocessing.py


In [39]:
%run test-multiprocessing.py

[0, 1, 4, 9, 16]


In [40]:
import concurrent.futures
import multiprocessing as mp
import time

def dummy(num, delay=5):
    """Simulate slow work: sleep for ``delay`` seconds, then return ``num`` squared.

    ``delay`` defaults to 5, the pause that used to be hard-coded, so existing
    single-argument calls such as ``executor.map(dummy, range(10))`` behave
    exactly as before.
    """
    time.sleep(delay)
    return num ** 2

# Time ten dummy() calls spread over five worker processes. Each call sleeps
# 5 s, so the work runs in two batches and takes roughly 10 s in total.
start_time = time.time()
# NOTE(review): the 'fork' start method is requested explicitly -- presumably so
# the notebook-defined dummy() is available in the workers; 'fork' is POSIX-only -- confirm.
with concurrent.futures.ProcessPoolExecutor(max_workers=5, mp_context=mp.get_context('fork')) as executor:
    results = executor.map(dummy, range(10))
print("Total Time:", time.time() - start_time)

Total Time: 10.049713134765625


In [41]:
# The values come back in the same order as the inputs (0..9, squared).
for r in results:
    print(r)

0
1
4
9
16
25
36
49
64
81


### Comparing single-thread, multi-thread, and asyncio

[Example by J. Anderson](https://realpython.com/python-concurrency/#how-to-speed-up-an-io-bound-program)

In [42]:
# https://realpython.com/python-concurrency/#how-to-speed-up-an-io-bound-program

import requests
import time

def download_site(url, session):
    """Request ``url`` through ``session``; the response is opened, then closed unused."""
    response = session.get(url)
    with response:
        # Uncomment to report the payload size:
        # print(f"Read {len(response.content)} from {url}")
        pass

def download_all_sites(sites):
    """Download every URL in ``sites`` one after another over a single shared session."""
    with requests.Session() as shared_session:
        for site_url in sites:
            download_site(site_url, shared_session)

# Baseline: fetch all sites sequentially and report the wall-clock time.
if __name__ == "__main__":
    # Two URLs repeated 80 times -> 160 requests.
    sites = [
        "https://www.jython.org",
        "http://olympus.realpython.org/dice",
    ] * 80
    start_time = time.time()
    download_all_sites(sites)
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} in {duration} seconds")

Downloaded 160 in 5.099061965942383 seconds


In [43]:
# https://realpython.com/python-concurrency/#how-to-speed-up-an-io-bound-program

import concurrent.futures
import requests
import threading
import time

# Per-thread storage: each worker thread sees its own ``session`` attribute.
thread_local = threading.local()


def get_session():
    """Return the calling thread's session, creating it on first use."""
    try:
        session = thread_local.session
    except AttributeError:
        # First call in this thread: create the session and remember it.
        session = thread_local.session = requests.Session()
    return session

def download_site(url):
    """Fetch ``url`` with the current thread's session; the response is closed unused."""
    response = get_session().get(url)
    with response:
        # Uncomment to report the payload size:
        # print(f"Read {len(response.content)} from {url}")
        pass

def download_all_sites(sites):
    """Fan the downloads out over a pool of five worker threads."""
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=5)
    with pool:
        # The iterator returned by map() is never consumed, so an exception
        # raised inside a worker is not re-raised here.
        pool.map(download_site, sites)

# Same workload as the sequential run, now through the thread pool.
if __name__ == "__main__":
    # Two URLs repeated 80 times -> 160 requests.
    sites = [
        "https://www.jython.org",
        "http://olympus.realpython.org/dice",
    ] * 80
    start_time = time.time()
    download_all_sites(sites)
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} in {duration} seconds")

Downloaded 160 in 1.2380750179290771 seconds


In [44]:
# %pip install nest_asyncio   (only if missing; %pip installs into this kernel's environment)
import nest_asyncio

# NOTE(review): presumably required so run_until_complete() can be called further
# down while Jupyter's own event loop is already running -- confirm.
nest_asyncio.apply()

In [45]:
# https://realpython.com/python-concurrency/#how-to-speed-up-an-io-bound-program

import asyncio
import time
import aiohttp

async def download_site(session, url):
    """Request ``url`` through ``session``; the response is opened, then released unused."""
    request = session.get(url)
    async with request:
        # Uncomment (and bind the response with ``as response``) to report the size:
        # print("Read {0} from {1}".format(response.content_length, url))
        pass

async def download_all_sites(sites):
    """Schedule one download task per URL on a shared client session and wait for all of them."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(download_site(session, url)) for url in sites]
        # return_exceptions=True: a failed download is returned as a result
        # instead of being raised here.
        await asyncio.gather(*tasks, return_exceptions=True)

# Same workload again, driven by asyncio on a single thread.
if __name__ == "__main__":
    # Two URLs repeated 80 times -> 160 requests.
    sites = [
        "https://www.jython.org",
        "http://olympus.realpython.org/dice",
    ] * 80
    start_time = time.time()
    # Run the coroutine to completion on the current event loop.
    asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} sites in {duration} seconds")

Downloaded 160 sites in 0.2504308223724365 seconds


In [46]:
# Calling an async function does not run its body: it only creates a coroutine
# object, which does nothing until it is awaited or scheduled on an event loop.
download_site("foo", "bar")

<coroutine object download_site at 0x106f99030>