利用歸并排序算法對大文件進行排序

jopen 9年前發布 | 21K 次閱讀 算法

原文  http://ivarchen.iteye.com/blog/2179500

歸并排序算法介紹,請參照Wikipeida

zh.wikipedia.org/wiki/%E5%BD%92%E5%B9%B6%E6%8E%92%E5%BA%8F

基本思想:

大文件分割成行數相等的兩個小文件,使用遞歸一直分割到所有所有小文件低于限制行數

小文件直接排序

兩個排序好的小文件歸并到大文件

直到最后所有排序好的文件歸并到輸入的大文件并返回

之前看了網上很多示例代碼,寫的很不簡潔, 引入了過多的臨時變量i, j, k等等, 導致程序基本沒法看,

只好自己寫了一個,沒有很關心執行效率, 只求夠用,以后有機會再優化一下吧。

JDK要求Java 8

package com.java.sort.merge;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.io.filefilter.AndFileFilter;
import org.apache.commons.io.filefilter.PrefixFileFilter;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class FileMergeSort {
  private static int limit = 9999;
  private static void cleanTempFiles() {
    FilenameFilter filter = new AndFileFilter(ImmutableList.of(new PrefixFileFilter("sort"), new SuffixFileFilter(".part")));
    ImmutableList.copyOf(FileUtils.getTempDirectory().listFiles(filter)).forEach(File::delete);
  }
  private static int lineNumber(File input) throws IOException {
    int count = 0;
    LineIterator iterator = FileUtils.lineIterator(input);
    while (iterator.hasNext()) {
      iterator.next();
      count++;
    }
    return count;
  }
  private static File split(File input, int from, int to) throws IOException {
    File part = File.createTempFile("sort", ".part");
    Long lineNumber = 0L;
    String line = null;
    List<String> lines = new ArrayList<>(to - from);
    LineIterator iterator = FileUtils.lineIterator(input);
    while (iterator.hasNext()) {
      if (lineNumber > to) break;
      line = iterator.next();
      if (lineNumber >= from && lineNumber <= to) {
        lines.add(line);
      }
      lineNumber++;
    }
    FileUtils.writeLines(part, lines);
    return part;
  }
  private static File merge(File source, File left, File right) throws IOException {
    PeekingIterator<String> leftLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(left));
    PeekingIterator<String> rightLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(right));
    String leftLine, rightLine;
    try (Writer writer = Files.newWriter(source, Charsets.UTF_8)) {
      writer.write("");
      while (leftLineIterator.hasNext() && rightLineIterator.hasNext()) {
        leftLine = leftLineIterator.peek();
        rightLine = rightLineIterator.peek();
        if (leftLine.compareTo(rightLine) < 0) {
          writer.append(leftLine.concat(IOUtils.LINE_SEPARATOR));
          leftLineIterator.next();
        } else {
          writer.append(rightLine.concat(IOUtils.LINE_SEPARATOR));
          rightLineIterator.next();
        }
      }
      while (leftLineIterator.hasNext()) {
        writer.append(leftLineIterator.next().concat(IOUtils.LINE_SEPARATOR));
      }
      while (rightLineIterator.hasNext()) {
        writer.append(rightLineIterator.next().concat(IOUtils.LINE_SEPARATOR));
      }
    }
    return source;
  }
  private static File directSort(File input) throws IOException {
    List<String> list = new ArrayList<>(limit);
    FileUtils.lineIterator(input).forEachRemaining(list::add);
    Collections.sort(list);
    FileUtils.writeLines(input, list);
    return input;
  }
  public static File mergeSort(File input) throws IOException {
    int total = lineNumber(input);
    if (total <= limit) {
      return directSort(input);
    }
    int half = total / 2;
    File left = mergeSort(split(input, 0, half));
    File right = mergeSort(split(input, half + 1, total));
    return merge(input, left, right);
  }
  @BeforeClass
  public static void init() throws IOException {
    cleanTempFiles();
    try (Writer writer = Files.newWriter(new File("long.txt"), Charsets.UTF_8)) {
      writer.write("");
      for (long i = 99999L; i > 0L; i--) {
        writer.append(Strings.padStart(String.valueOf(i), 5, '0').concat(IOUtils.LINE_SEPARATOR));
      }
    }
  }
  @AfterClass
  public static void clean() {
    cleanTempFiles();
  }
  @Test
  public void testSort() throws IOException {
    File sorted = mergeSort(new File("long.txt"));
  }
}

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.java.app</groupId>
  <artifactId>sample</artifactId>
  <version>1.0-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-csv</artifactId>
      <version>1.1</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>18.0</version>
    </dependency>
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>javax.servlet-api</artifactId>
      <version>3.1.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-api</artifactId>
      <version>2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-core</artifactId>
      <version>2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-jcl</artifactId>
      <version>2.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!