"""Generate small mock Parquet files and move them into HDFS.

For each group number the script builds a tiny DataFrame of random
``id`` / ``name`` / ``age`` / ``score`` / ``group`` records, writes it to a
local snappy-compressed Parquet file with pyarrow, removes any file of the
same name from the HDFS target directory, and moves the local file there.
"""
import os
import random
import string
import subprocess
import sys

import numpy as np
import pandas as pd

# Number of records written to each mock file.
rows = 3
# Number of mock files (one per ``group`` value) to generate.
FILE_COUNT = 4
# HDFS directory that receives the generated files.
HDFS_DIR = '/user/b_aip/zliu3/parquets'


def random_string(n: int) -> str:
    """Return ``n`` characters drawn uniformly from ``string.ascii_letters``."""
    # random.choices is uniform by default, so no explicit weights are needed.
    return ''.join(random.choices(string.ascii_letters, k=n))


def build_frame(group: int, n_rows: int) -> pd.DataFrame:
    """Build one mock DataFrame.

    Args:
        group: Value stored in every row of the ``group`` column.
        n_rows: Number of rows to generate.

    Returns:
        A DataFrame with columns ``id``, ``name``, ``age``, ``score`` and
        ``group``, indexed by consecutive characters starting at ``'a'``.
    """
    index = [chr(ord('a') + offset) for offset in range(n_rows)]
    data = {
        'id': range(n_rows),
        'name': [random_string(5) for _ in range(n_rows)],
        'age': [random.randrange(10, 99) for _ in range(n_rows)],
        'score': [round(random.uniform(0, 100), 2) for _ in range(n_rows)],
        # A scalar is broadcast to every row of the index.
        'group': group,
    }
    return pd.DataFrame(data=data, index=index)


def _run_hadoop_fs(*args: str) -> bool:
    """Run ``hadoop fs <args>`` and report whether it succeeded.

    The command is passed as an argument list, so no shell parsing is
    involved. A failure is reported on stderr instead of being dropped;
    the script keeps going, as it did before.
    """
    command = ['hadoop', 'fs', *args]
    try:
        returncode = subprocess.run(command, check=False).returncode
    except OSError as exc:
        # Raised when the hadoop executable cannot be started at all.
        print(f'could not launch {command[0]!r}: {exc}', file=sys.stderr)
        return False
    if returncode != 0:
        print(
            f"command failed with exit code {returncode}: {' '.join(command)}",
            file=sys.stderr,
        )
        return False
    return True


def main() -> None:
    """Write each mock Parquet file locally, then move it into HDFS."""
    for i in range(FILE_COUNT):
        df = build_frame(i, rows)
        print(df)

        file_name = f'mock-id-name-age-score-{i}.parquet'
        df.to_parquet(path=file_name, engine='pyarrow', compression='snappy', index=True)

        # Remove any previous copy first; -f keeps a missing file from being an error.
        _run_hadoop_fs('-rm', '-f', f'{HDFS_DIR}/{file_name}')
        _run_hadoop_fs('-moveFromLocal', file_name, HDFS_DIR)


if __name__ == '__main__':
    main()
# Tags: rows, string, df, random, range, Parquet, data
# Source: https://www.cnblogs.com/dissipate/p/16721431.html