File size: 982 Bytes
6229e10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import jsonlines
from multiprocessing import Pool
from tqdm import tqdm

def load_json(path):
	"""Read the file at ``path`` and return its parsed JSON content.

	The file is opened in text read mode and closed before returning.
	"""
	with open(path, "r") as src:
		parsed = json.load(src)
	return parsed

def dump_json(x, path):
	"""Serialize ``x`` as JSON into the file at ``path``.

	The file is opened in write mode, so existing content is replaced.
	Output is pretty-printed with a 4-space indent.
	"""
	with open(path, "w") as sink:
		json.dump(obj=x, fp=sink, indent=4)

def apply_func(x, func, processes=8):
  """Run ``func`` over the elements of ``x`` in a process pool, lazily.

  Results are yielded in completion order (``imap_unordered``), not in the
  order of ``x``, while a tqdm progress bar tracks how many have arrived.

  Args:
    x: Sized collection of inputs; ``len(x)`` is used as the progress total.
    func: Callable applied to each element in a worker process. Each result
      must unpack into exactly two values, which are re-yielded as a pair
      (presumably ``(input, output)``, going by the local names).
    processes: Number of worker processes; defaults to 8.

  Yields:
    The two-element tuples returned by ``func``.
  """
  # The context manager terminates the workers once the generator is
  # exhausted or closed; previously the pool was never released.
  with Pool(processes) as pool:
    for inval, outval in tqdm(pool.imap_unordered(func, x), total=len(x)):
      yield inval, outval

def load_jsonl(path, max_items=None):
	"""Lazily yield the items of the JSON Lines file at ``path``.

	Args:
		path: Location of the file, opened with ``jsonlines.open``.
		max_items: Upper bound on the number of items to yield. ``None``
			(the default) means no limit; zero or a negative value yields
			nothing.

	Yields:
		Each decoded item, in file order.
	"""
	with jsonlines.open(path) as reader:
		if max_items is not None and max_items <= 0:
			return
		# Count from 1 so the limit is compared against the number of items
		# already yielded; comparing the 0-based index let one extra item
		# through.
		for count, item in enumerate(reader, start=1):
			yield item
			if max_items is not None and count >= max_items:
				break

def dump_jsonl(data, path):
	"""Write every element of the list ``data`` to ``path`` via ``jsonlines``.

	``data`` must be a ``list`` (checked with an assertion). The file is
	opened in write mode and a transient tqdm progress bar is shown while
	the items are written.
	"""
	assert isinstance(data, list)
	with jsonlines.open(path, mode="w") as out:
		progress = tqdm(data, leave=False)
		for record in progress:
			out.write(record)

class dump_jsonl_multistage:
	"""Incremental JSON Lines writer that keeps one file handle open.

	Items can be written one at a time (``add``) or in batches (``extend``)
	across several stages of a program. Call ``close`` when done, or use the
	instance as a context manager so the handle is released even if an
	exception occurs:

		with dump_jsonl_multistage(path) as out:
			out.add(item)
	"""

	def __init__(self, path, mode="a"):
		"""Open ``path`` with ``jsonlines.open``; ``mode`` defaults to append."""
		# flush=True is forwarded to jsonlines.open; per the jsonlines
		# documentation this flushes the file after every written line.
		self.wr = jsonlines.open(path, mode=mode, flush=True)

	def add(self, item):
		"""Write a single ``item`` through the underlying writer."""
		self.wr.write(item)

	def extend(self, items):
		"""Write each element of the iterable ``items``, in order."""
		for item in items:
			self.add(item)

	def close(self):
		"""Close the underlying writer."""
		self.wr.close()

	def __enter__(self):
		"""Return this writer so it can be bound in a ``with`` statement."""
		return self

	def __exit__(self, exc_type, exc_value, traceback):
		"""Close the writer on leaving the ``with`` block; never suppress errors."""
		self.close()
		return False