‫Google משתמשת בטכנולוגיית AI כדי לתרגם תוכן לשפה המועדפת עליך. בתרגומים כאלו עשויות להיות שגיאות.

מחבר Bigtable HBase Beam

כדי לעזור לכם להשתמש ב-Bigtable בפייפליין של Dataflow, זמינים שני מחברי קלט/פלט של Bigtable Beam בקוד פתוח.

אם אתם מבצעים מיגרציה מ-HBase ל-Bigtable או שהאפליקציה שלכם קוראת ל-HBase API, אתם יכולים להשתמש ב-Bigtable HBase Beam connector ‏(CloudBigtableIO) שמוסבר בדף הזה.

בכל המקרים האחרים, כדאי להשתמש במחבר Bigtable Beam ‏(BigtableIO) בשילוב עם לקוח Cloud Bigtable ל-Java, שפועל עם ממשקי Cloud Bigtable API. כדי להתחיל להשתמש במחבר הזה, אפשר לעיין במאמר Bigtable Beam connector.

מידע נוסף על מודל התכנות של Apache Beam זמין במסמכי התיעוד של Beam.

תחילת העבודה עם HBase

מחבר Bigtable HBase Beam כתוב ב-Java ומבוסס על לקוח Bigtable HBase ל-Java. הוא תואם ל-Dataflow SDK 2.x ל-Java, שמבוסס על Apache Beam. קוד המקור של המחבר נמצא ב-GitHub במאגר googleapis/java-bigtable-hbase.

בדף הזה מוסבר איך להשתמש בהמרות Read ו-Write.

מגדירים אימות

כדי להשתמש בדוגמאות של Java שבדף הזה בסביבת פיתוח מקומית, מתקינים ומפעילים את ה-CLI של gcloud, ואז מגדירים את Application Default Credentials באמצעות פרטי הכניסה של המשתמש.

התקינו את ה-CLI של Google Cloud.
אם אתם משתמשים בספק זהויות חיצוני (IdP), קודם אתם צריכים להיכנס ל-CLI של gcloud באמצעות המאגר המאוחד לניהול זהויות.
אם אתם משתמשים במעטפת מקומית, אתם צריכים ליצור פרטי כניסה לאימות מקומי עבור חשבון המשתמש:
```
gcloud auth application-default login
```
אם אתם משתמשים ב-Cloud Shell, אין צורך לבצע את הפעולה הזו.

אם מוחזרת שגיאת אימות ואתם משתמשים בספק זהויות חיצוני (IdP), ודאו שנכנסתם ל-CLI של gcloud באמצעות המאגר המאוחד לניהול זהויות.

מידע נוסף זמין במאמר הגדרת אימות לסביבת פיתוח מקומית.

למידע נוסף על הגדרה של אימות בסביבת ייצור, ראו הגדרה של Application Default Credentials לקוד שפועל ב-Google Cloud.

הוספת המחבר לפרויקט Maven

כדי להוסיף את מחבר Bigtable HBase Beam לפרויקט Maven, מוסיפים את ארטיפקט Maven לקובץ pom.xml כתלות:

<dependency>
  <groupId>com.google.cloud.bigtable</groupId>
  <artifactId>bigtable-hbase-beam</artifactId>
  <version>2.12.0</version>
</dependency>

ציון ההגדרה של Bigtable

יוצרים ממשק אפשרויות כדי לאפשר קלט להרצת הפייפליין:

/**
 * Pipeline options that identify the Bigtable table the pipeline works with.
 *
 * <p>Extends {@code DataflowPipelineOptions} with three string options; each getter declares its
 * default value through {@code @Default.String}.
 */
public interface BigtableOptions extends DataflowPipelineOptions {

  /** Returns the Bigtable project ID (default {@code "bigtable-project"}). */
  @Description("The Bigtable project ID, this can be different than your Dataflow project")
  @Default.String("bigtable-project")
  String getBigtableProjectId();

  /** Sets the Bigtable project ID. */
  void setBigtableProjectId(String bigtableProjectId);

  /** Returns the Bigtable instance ID (default {@code "bigtable-instance"}). */
  @Description("The Bigtable instance ID")
  @Default.String("bigtable-instance")
  String getBigtableInstanceId();

  /** Sets the Bigtable instance ID. */
  void setBigtableInstanceId(String bigtableInstanceId);

  /** Returns the ID of the table inside the instance (default {@code "mobile-time-series"}). */
  @Description("The Bigtable table ID in the instance.")
  @Default.String("mobile-time-series")
  String getBigtableTableId();

  /** Sets the ID of the table inside the instance. */
  void setBigtableTableId(String bigtableTableId);
}

כשקוראים מ-Bigtable או כותבים ל-Bigtable, צריך לספק אובייקט הגדרה מסוג CloudBigtableTableConfiguration. האובייקט הזה מציין את מזהה הפרויקט ומזהה המופע של הטבלה, וגם את השם של הטבלה עצמה:

// Describe the target table: project, instance and table ID are all taken from the
// pipeline options object.
CloudBigtableTableConfiguration bigtableTableConfig =
    new CloudBigtableTableConfiguration.Builder()
        .withProjectId(options.getBigtableProjectId())
        .withInstanceId(options.getBigtableInstanceId())
        .withTableId(options.getBigtableTableId())
        .build();

לקריאה, צריך לספק אובייקט הגדרה מסוג CloudBigtableScanConfiguration, שמאפשר לציין אובייקט Scan של Apache HBase שמגביל ומסנן את תוצאות הקריאה. פרטים נוספים מופיעים במאמר בנושא קריאה מ-Bigtable.

קריאה מ-Bigtable

כדי לקרוא מטבלת Bigtable, צריך להחיל טרנספורמציית Read על התוצאה של פעולת CloudBigtableIO.read. הטרנספורמציה Read מחזירה PCollection של אובייקטי Result של HBase, כאשר כל רכיב ב-PCollection מייצג שורה אחת בטבלה.

// Read from the table described by `config`; each element handed to the DoFn is an HBase Result.
p.apply(Read.from(CloudBigtableIO.read(config)))
    .apply(
        ParDo.of(
            // Output type is Void and `out` is never written to: this step only prints.
            new DoFn<Result, Void>() {
              @ProcessElement
              public void processElement(@Element Result row, OutputReceiver<Void> out) {
                // Print the row key of the current Result as a string.
                System.out.println(Bytes.toString(row.getRow()));
              }
            }));

כברירת מחדל, פעולת CloudBigtableIO.read מחזירה את כל השורות בטבלה. אפשר להשתמש באובייקט Scan של HBase כדי להגביל את הקריאה לטווח של מפתחות שורות בטבלה, או כדי להחיל מסננים על תוצאות הקריאה. כדי להשתמש באובייקט Scan, צריך לכלול אותו ב-CloudBigtableScanConfiguration.

לדוגמה, אפשר להוסיף אובייקט Scan שמחזיר רק את צמד המפתח/ערך הראשון מכל שורה בטבלה, וזה שימושי כשרוצים לספור את מספר השורות בטבלה:

import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableScanConfiguration;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Sample pipeline that scans a Bigtable table through {@code CloudBigtableIO.read} and prints the
 * row key of every {@code Result} it receives.
 */
public class HelloWorldRead {

  /** Options naming the Bigtable project, instance and table that the pipeline reads. */
  public interface BigtableOptions extends DataflowPipelineOptions {
    /** Returns the Bigtable project ID (default {@code "bigtable-project"}). */
    @Description("The Bigtable project ID, this can be different than your Dataflow project")
    @Default.String("bigtable-project")
    String getBigtableProjectId();

    /** Sets the Bigtable project ID. */
    void setBigtableProjectId(String bigtableProjectId);

    /** Returns the Bigtable instance ID (default {@code "bigtable-instance"}). */
    @Description("The Bigtable instance ID")
    @Default.String("bigtable-instance")
    String getBigtableInstanceId();

    /** Sets the Bigtable instance ID. */
    void setBigtableInstanceId(String bigtableInstanceId);

    /** Returns the ID of the table inside the instance (default {@code "mobile-time-series"}). */
    @Description("The Bigtable table ID in the instance.")
    @Default.String("mobile-time-series")
    String getBigtableTableId();

    /** Sets the ID of the table inside the instance. */
    void setBigtableTableId(String bigtableTableId);
  }

  /**
   * Parses {@code args} into {@link BigtableOptions}, assembles the read pipeline and runs it,
   * waiting on the result with {@code waitUntilFinish()}.
   *
   * @param args command-line pipeline options
   */
  public static void main(String[] args) {
    BigtableOptions readOptions =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableOptions.class);
    Pipeline pipeline = Pipeline.create(readOptions);

    CloudBigtableScanConfiguration scanConfig = scanConfigFor(readOptions);

    pipeline
        .apply(Read.from(CloudBigtableIO.read(scanConfig)))
        .apply(
            ParDo.of(
                new DoFn<Result, Void>() {
                  @ProcessElement
                  public void processElement(
                      @Element Result result, OutputReceiver<Void> receiver) {
                    // The receiver is never written to: this step only prints the row key.
                    String rowKey = Bytes.toString(result.getRow());
                    System.out.println(rowKey);
                  }
                }));

    pipeline.run().waitUntilFinish();
  }

  /** Creates a scan with block caching switched off and a {@code FirstKeyOnlyFilter} applied. */
  private static Scan firstKeyOnlyScan() {
    Scan firstKeyScan = new Scan();
    firstKeyScan.setCacheBlocks(false);
    firstKeyScan.setFilter(new FirstKeyOnlyFilter());
    return firstKeyScan;
  }

  /**
   * Builds the scan configuration for the project, instance and table named in {@code options},
   * using the scan from {@link #firstKeyOnlyScan()}.
   */
  private static CloudBigtableScanConfiguration scanConfigFor(BigtableOptions options) {
    // The scan is created before the builder chain starts, as in a straight-line version.
    Scan firstKeyScan = firstKeyOnlyScan();
    return new CloudBigtableScanConfiguration.Builder()
        .withProjectId(options.getBigtableProjectId())
        .withInstanceId(options.getBigtableInstanceId())
        .withTableId(options.getBigtableTableId())
        .withScan(firstKeyScan)
        .build();
  }
}

כתיבה ל-Bigtable

כדי לכתוב לטבלת Bigtable, צריך להחיל (apply) פעולת CloudBigtableIO.writeToTable. את הפעולה הזו מחילים על PCollection של אובייקטי Mutation של HBase, שיכולים לכלול אובייקטים של Put ושל Delete.

הטבלה ב-Bigtable צריכה להיות קיימת כבר, וצריך להגדיר בה את משפחות העמודות המתאימות. מחבר Dataflow לא יוצר טבלאות ומשפחות עמודות באופן דינמי בזמן הריצה. אפשר להשתמש ב-CLI של cbt כדי ליצור טבלה ולהגדיר משפחות עמודות, או לעשות זאת באופן פרוגרמטי.

לפני שכותבים ל-Bigtable, צריך ליצור את הפייפליין של Dataflow כדי שפעולות הכנסה (Put) ומחיקה (Delete) יוכלו לעבור סריאליזציה ברשת:

// Parse the command-line arguments into BigtableOptions (with validation enabled),
// then create the pipeline from those options.
BigtableOptions options =
    PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableOptions.class);
Pipeline p = Pipeline.create(options);

באופן כללי, תצטרכו לבצע טרנספורמציה, כמו ParDo, כדי לעצב את נתוני הפלט שלכם לאוסף של אובייקטים של HBase Put או Delete. בדוגמה הבאה מוצגת טרנספורמציה של DoFn שמשתמשת בערך הנוכחי כמפתח השורה של Put. לאחר מכן אפשר לכתוב את אובייקטי Put ל-Bigtable.

// Start from two hard-coded row keys and turn each one into an HBase Put.
p.apply(Create.of("phone#4c410523#20190501", "phone#4c410523#20190502"))
    .apply(
        ParDo.of(
            new DoFn<String, Mutation>() {
              @ProcessElement
              public void processElement(@Element String rowkey, OutputReceiver<Mutation> out) {
                // The cell timestamp is the wall-clock time at which the element is processed.
                long timestamp = System.currentTimeMillis();
                // The incoming string is used as the row key of the Put.
                Put row = new Put(Bytes.toBytes(rowkey));

                // One cell per row: family "stats_summary", qualifier "os_build", value "android".
                row.addColumn(
                    Bytes.toBytes("stats_summary"),
                    Bytes.toBytes("os_build"),
                    timestamp,
                    Bytes.toBytes("android"));
                out.output(row);
              }
            }))
    // Write the emitted mutations to the table described by bigtableTableConfig.
    .apply(CloudBigtableIO.writeToTable(bigtableTableConfig));

כדי להפעיל בקרת זרימה לכתיבה באצווה (batch write flow control), צריך להגדיר את BIGTABLE_ENABLE_BULK_MUTATION_FLOW_CONTROL לערך true. התכונה הזו מגבילה באופן אוטומטי את קצב התעבורה של בקשות כתיבה באצווה, ומאפשרת להתאמה האוטומטית לעומס של Bigtable להוסיף או להסיר צמתים באופן אוטומטי כדי לטפל במשימת Dataflow.

// Table configuration with bulk-mutation flow control switched on: the extra withConfiguration
// call sets BIGTABLE_ENABLE_BULK_MUTATION_FLOW_CONTROL to "true".
CloudBigtableTableConfiguration bigtableTableConfig =
    new CloudBigtableTableConfiguration.Builder()
        .withProjectId(options.getBigtableProjectId())
        .withInstanceId(options.getBigtableInstanceId())
        .withTableId(options.getBigtableTableId())
        .withConfiguration(BigtableOptionsFactory.BIGTABLE_ENABLE_BULK_MUTATION_FLOW_CONTROL,
            "true")
        .build();
// This fragment is the body of a method that hands the configuration back to its caller.
return bigtableTableConfig;

דוגמה מלאה לכתיבה, כולל וריאציה שמפעילה בקרת זרימה לכתיבה באצווה (batch write flow control):


import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration;
import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Sample pipeline that turns two hard-coded row keys into HBase {@code Put} mutations and writes
 * them to a Bigtable table through {@code CloudBigtableIO.writeToTable}.
 */
public class HelloWorldWrite {

  /** Options naming the Bigtable project, instance and table that the pipeline writes to. */
  public interface BigtableOptions extends DataflowPipelineOptions {

    /** Returns the Bigtable project ID (default {@code "bigtable-project"}). */
    @Description("The Bigtable project ID, this can be different than your Dataflow project")
    @Default.String("bigtable-project")
    String getBigtableProjectId();

    /** Sets the Bigtable project ID. */
    void setBigtableProjectId(String bigtableProjectId);

    /** Returns the Bigtable instance ID (default {@code "bigtable-instance"}). */
    @Description("The Bigtable instance ID")
    @Default.String("bigtable-instance")
    String getBigtableInstanceId();

    /** Sets the Bigtable instance ID. */
    void setBigtableInstanceId(String bigtableInstanceId);

    /** Returns the ID of the table inside the instance (default {@code "mobile-time-series"}). */
    @Description("The Bigtable table ID in the instance.")
    @Default.String("mobile-time-series")
    String getBigtableTableId();

    /** Sets the ID of the table inside the instance. */
    void setBigtableTableId(String bigtableTableId);
  }

  /**
   * Parses {@code args} into {@link BigtableOptions}, assembles the write pipeline and runs it,
   * waiting on the result with {@code waitUntilFinish()}.
   *
   * @param args command-line pipeline options
   */
  public static void main(String[] args) {
    BigtableOptions writeOptions =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableOptions.class);
    Pipeline pipeline = Pipeline.create(writeOptions);

    CloudBigtableTableConfiguration tableConfig = tableConfigFor(writeOptions);

    pipeline
        .apply(Create.of("phone#4c410523#20190501", "phone#4c410523#20190502"))
        .apply(
            ParDo.of(
                new DoFn<String, Mutation>() {
                  @ProcessElement
                  public void processElement(
                      @Element String rowKey, OutputReceiver<Mutation> receiver) {
                    // Timestamp is taken first, then the Put keyed by the incoming string.
                    long nowMillis = System.currentTimeMillis();
                    Put put = new Put(Bytes.toBytes(rowKey));

                    // One cell: family "stats_summary", qualifier "os_build", value "android".
                    put.addColumn(
                        Bytes.toBytes("stats_summary"),
                        Bytes.toBytes("os_build"),
                        nowMillis,
                        Bytes.toBytes("android"));
                    receiver.output(put);
                  }
                }))
        .apply(CloudBigtableIO.writeToTable(tableConfig));

    pipeline.run().waitUntilFinish();
  }

  /**
   * Builds a table configuration like the one used by {@code main}, with
   * {@code BIGTABLE_ENABLE_BULK_MUTATION_FLOW_CONTROL} additionally set to {@code "true"}.
   *
   * @param options source of the project, instance and table IDs
   * @return the table configuration with the flow-control setting applied
   */
  public static CloudBigtableTableConfiguration batchWriteFlowControlExample(
      BigtableOptions options) {
    return new CloudBigtableTableConfiguration.Builder()
        .withProjectId(options.getBigtableProjectId())
        .withInstanceId(options.getBigtableInstanceId())
        .withTableId(options.getBigtableTableId())
        .withConfiguration(
            BigtableOptionsFactory.BIGTABLE_ENABLE_BULK_MUTATION_FLOW_CONTROL, "true")
        .build();
  }

  /** Builds the plain table configuration for the table named in {@code options}. */
  private static CloudBigtableTableConfiguration tableConfigFor(BigtableOptions options) {
    return new CloudBigtableTableConfiguration.Builder()
        .withProjectId(options.getBigtableProjectId())
        .withInstanceId(options.getBigtableInstanceId())
        .withTableId(options.getBigtableTableId())
        .build();
  }
}