scala - स्पार्क में विभिन्न सम्मिलित प्रकार क्या हैं?




apache-spark apache-spark-sql (2)

मैंने डॉक्स को देखा और यह कहता है कि निम्नलिखित प्रकार शामिल हैं:

प्रदर्शन में शामिल होने का प्रकार। डिफ़ॉल्ट आंतरिक। इनमें से एक होना चाहिए: इनर, क्रॉस, आउटर, फुल, राउटर, लेफ्ट, राउटर, राईट, राईट, राउटर, लेफ्ट_समी, लेफ्ट_आंटी।

मैंने एसक्यूएल left_semi left_anti उत्तर को देखा और उत्तर के शीर्ष जोड़े ने ऊपर से कुछ left_semi का उल्लेख नहीं किया जैसे कि left_semi और left_anti । स्पार्क में उनका क्या मतलब है?


प्यार पथिक का उदाहरण। यहां स्पार्क v2 और डेटाफ्रेम का उपयोग करते हुए क्रॉस-जॉइन सहित जावा में एक संभावित अनुवाद है।

package net.jgp.books.sparkInAction.ch12.lab940AllJoins;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * All joins in a single app, inspired by
 * https://.com/questions/45990633/what-are-the-various-join-types-in-spark.
 * 
 * Used in Spark in Action 2e, http://jgp.net/sia
 * 
 * @author jgp
 */
public class AllJoinsApp {

  /**
   * main() is your entry point to the application.
   * 
   * @param args
   */
  public static void main(String[] args) {
    AllJoinsApp app = new AllJoinsApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Processing of invoices")
        .master("local")
        .getOrCreate();

    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "id",
            DataTypes.IntegerType,
            false),
        DataTypes.createStructField(
            "value",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<Row>();
    rows.add(RowFactory.create(1, "A1"));
    rows.add(RowFactory.create(2, "A2"));
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    Dataset<Row> dfLeft = spark.createDataFrame(rows, schema);
    dfLeft.show();

    rows = new ArrayList<Row>();
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    rows.add(RowFactory.create(4, "A4_1"));
    rows.add(RowFactory.create(5, "A5"));
    rows.add(RowFactory.create(6, "A6"));
    Dataset<Row> dfRight = spark.createDataFrame(rows, schema);
    dfRight.show();

    String[] joinTypes = new String[] { 
        "inner", // v2.0.0. default
        "cross", // v2.2.0
        "outer", // v2.0.0
        "full", // v2.1.1
        "full_outer", // v2.1.1
        "left", // v2.1.1
        "left_outer", // v2.0.0
        "right", // v2.1.1
        "right_outer", // v2.0.0
        "left_semi", // v2.0.0, was leftsemi before v2.1.1
        "left_anti" // v2.1.1
        };

    for (String joinType : joinTypes) {
      System.out.println(joinType.toUpperCase() + " JOIN");
      Dataset<Row> df = dfLeft.join(
          dfRight, 
          dfLeft.col("id").equalTo(dfRight.col("id")), 
          joinType);
      df.orderBy(dfLeft.col("id")).show();
    }
  }
}

मैं इस उदाहरण को स्पार्क इन एक्शन, 2 ई के अध्याय 12 के भंडार में डालूंगा।


यहाँ एक सरल चित्रण प्रयोग है:

import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

object SparkSandbox extends App {

  case class Row(id: Int, value: String)

  private[this] implicit val spark = SparkSession.builder().master("local[*]").getOrCreate()
  import spark.implicits._
  spark.sparkContext.setLogLevel("ERROR")

  val r1 = Seq(Row(1, "A1"), Row(2, "A2"), Row(3, "A3"), Row(4, "A4")).toDS()
  val r2 = Seq(Row(3, "A3"), Row(4, "A4"), Row(4, "A4_1"), Row(5, "A5"), Row(6, "A6")).toDS()

  val joinTypes = Seq("inner", "outer", "full", "full_outer", "left", "left_outer", "right", "right_outer", "left_semi", "left_anti")

  joinTypes foreach {joinType =>
    println(s"${joinType.toUpperCase()} JOIN")
    r1.join(right = r2, usingColumns = Seq("id"), joinType = joinType).orderBy("id").show()
  }
}

उत्पादन

 INNER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

FULL JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

FULL_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

LEFT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

LEFT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

RIGHT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

RIGHT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

LEFT_SEMI JOIN
+---+-----+
| id|value|
+---+-----+
|  3|   A3|
|  4|   A4|
+---+-----+

LEFT_ANTI JOIN
+---+-----+
| id|value|
+---+-----+
|  1|   A1|
|  2|   A2|
+---+-----+






apache-spark-2.0