假设文件中每行存放一个字符串(如下图所示),现要求对这些字符串进行去重
可以将数据逐行读入HashSet进行去重,代码如下。但如果文件很大,去重后的数据超出了JVM堆内存,就会抛出异常java.lang.OutOfMemoryError: Java heap space
// Naive approach: read the file line by line and keep every distinct,
// non-empty (after trimming) line in an in-memory set. This only works
// while all distinct lines fit into the JVM heap.
HashSet<String> set = new HashSet<String>();
File file = new File("E:\\aa.txt");
// try-with-resources closes the reader even when readLine() throws.
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
    String tempString;
    while ((tempString = reader.readLine()) != null) {
        tempString = tempString.trim();
        // Compare by content: `tempString != ""` compares object references,
        // so empty lines read from the file were not filtered out.
        if (!tempString.isEmpty()) {
            System.out.println(tempString);
            set.add(tempString);
        }
    }
}
可尝试分批处理:先用Hash取模的方法将大文件拆分成若干个小文件(相同的字符串hash值相同,必然落入同一个小文件),再逐个将小文件的数据存入HashSet去重,最后汇总各个小文件的去重结果
首先生成测试数据文件aa.txt
//多线程插入测试数据 public void set() throws FileNotFoundException { File file = new File("E:\\aa.txt"); PrintWriter pws = new PrintWriter(file); CountDownLatch latch = new CountDownLatch(9); ExecutorService executorService = Executors.newFixedThreadPool(9); for(int i=0;i<9;i++){ executorService.execute(new SetClass("name+"+UUID.randomUUID().toString(),latch,file,pws)); } try { latch.await(); //线程阻塞, 当latch中数量为0时,放行 } catch (InterruptedException e) { e.printStackTrace(); } executorService.shutdown(); //关闭线程 pws.close(); } public class SetClass extends Thread{ private final CountDownLatch countDownLatch; private File file; private PrintWriter pws; public SetClass(String name, CountDownLatch countDownLatch1,File file,PrintWriter pws){ super(name); this.countDownLatch = countDownLatch1; this.file = file; this.pws=pws; } @Override public void run() { for(int i=0;i<100000;i++){ pws.println(UUID.randomUUID().toString()); System.out.println(Thread.currentThread().getName()+":"+i); } countDownLatch.countDown(); } }
对大文件进行拆分:利用Hash取模,使重复的数据落入同一个小文件
/** * 将文件hash取模之后放到不同的小文件中 * @param targetFile 要去重的文件路径 * @param splitSize 将目标文件切割成多少份hash取模的小文件个数 * @return */ public static File[] splitFile(String targetFile,int splitSize){ File file = new File(targetFile); BufferedReader reader = null; PrintWriter[] pws = new PrintWriter[splitSize]; File[] littleFiles = new File[splitSize]; String parentPath = file.getParent(); File tempFolder = new File(parentPath + File.separator + "test"); if(!tempFolder.exists()){ tempFolder.mkdir(); } for(int i=0;i<splitSize;i++){ littleFiles[i] = new File(tempFolder.getAbsolutePath() + File.separator + i + ".txt"); if(littleFiles[i].exists()){ littleFiles[i].delete(); } try { pws[i] = new PrintWriter(littleFiles[i]); } catch (FileNotFoundException e) { e.printStackTrace(); } } try { reader = new BufferedReader(new FileReader(file)); String tempString = null; while ((tempString = reader.readLine()) != null) { // reader.readLine()逐行读取,避免一次性读完整个文件 tempString = tempString.trim(); if(tempString != ""){ //关键是将每行数据hash取模之后放到对应取模值的文件中,确保hash值相同的字符串都在同一个文件里面 int index = Math.abs(tempString.hashCode() % splitSize); pws[index].println(tempString); } } } catch (Exception e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { e1.printStackTrace(); } } for(int i=0;i<splitSize;i++){ if(pws[i] != null){ pws[i].close(); } } } return littleFiles; }
对小文件进行去重并合并结果
/** * 对小文件进行去重合并 * @param littleFiles 切割之后的小文件数组 * @param distinctFilePath 去重之后的文件路径 * @param splitSize 小文件大小 */ public static void distinct(File[] littleFiles,String distinctFilePath,int splitSize){ File distinctedFile = new File(distinctFilePath); FileReader[] frs = new FileReader[splitSize]; BufferedReader[] brs = new BufferedReader[splitSize]; PrintWriter pw = null; try { if(distinctedFile.exists()){ distinctedFile.delete(); } distinctedFile.createNewFile(); pw = new PrintWriter(distinctedFile); Set<String> unicSet = new HashSet<String>(); for(int i=0;i<splitSize;i++){ if(littleFiles[i].exists()){ System.out.println("开始对小文件:" + littleFiles[i].getName() + "去重"); frs[i] = new FileReader(littleFiles[i]); brs[i] = new BufferedReader(frs[i]); String line = null; while((line = brs[i].readLine())!=null){ if(line != ""){ unicSet.add(line); } } for(String s:unicSet){ pw.println(s); } unicSet.clear(); System.gc(); } } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e1){ e1.printStackTrace(); } finally { for(int i=0;i<splitSize;i++){ try { if(null != brs[i]){ brs[i].close(); } if(null != frs[i]){ frs[i].close(); } } catch (IOException e) { e.printStackTrace(); } //合并完成之后删除临时小文件 if(littleFiles[i].exists()){ littleFiles[i].delete(); } } if(null != pw){ pw.close(); } } }
标签:文件,splitSize,PrintWriter,File,tempString,new From: https://www.cnblogs.com/lhc-hhh/p/18430278