Aggregate data automatically

Data pipelines automatically transform raw sensor readings into summaries and insights on a schedule that you choose. Viam stores the output of these pipelines in a separate, queryable database.

For example, you may often query the average temperature across multiple sensors for each hour of the day. To make these queries faster, you can use a data pipeline to pre-calculate the results, saving significant computational resources.

Prerequisites

While not a requirement, it is easier to test data pipelines if you have already enabled data capture from at least one component and begun syncing data with Viam before setting up a pipeline.

Only users with organization owner permissions can create a data pipeline.

Pipeline management

Create a pipeline

To define a data pipeline, specify a name, organization, schedule, data source type, and query:

Use datapipelines create to create your pipeline:

viam datapipelines create \
  --org-id=<org-id> \
  --name=sensor-counts \
  --schedule="0 * * * *" \
  --data-source-type="standard" \
  --mql='[{"$match": {"component_name": "sensor"}}, {"$group": {"_id": "$location_id", "avg_temp": {"$avg": "$data.readings.temperature"}, "count": {"$sum": 1}}}, {"$project": {"location": "$_id", "avg_temp": 1, "count": 1}}]'

To pass your query from a file instead of specifying it inline, use the --mql-path flag in place of --mql. To create a pipeline that reads data from the hot data store, specify --data-source-type=hotstorage.
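
For example, the following variant creates a pipeline that runs every 15 minutes against the hot data store, assuming your aggregation stages are saved in a local file named pipeline.json (a hypothetical filename):

viam datapipelines create \
  --org-id=<org-id> \
  --name=sensor-counts-hot \
  --schedule="*/15 * * * *" \
  --data-source-type=hotstorage \
  --mql-path=pipeline.json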

To define a new pipeline, call DataClient.CreateDataPipeline:

from viam import DataClient
import bson

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

pipeline_id = data_client.create_data_pipeline(
    organization_id="<org-id>",
    name="hourly-temp-average",
    mql_binary=[
        bson.encode({"$match": {"component_name": "temperature-sensor"}}),
        bson.encode({
            "$group": {
                "_id": "$location_id",
                "avg_temp": {"$avg": "$data.readings.temperature"},
                "count": {"$sum": 1}
            }
        }),
        bson.encode({
            "$project": {
                "location": "$_id",
                "avg_temp": 1,
                "count": 1
            }
        })
    ],
    schedule="0 * * * *"  # Run hourly
)
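
The call returns the new pipeline's ID; save it, since querying results and managing the pipeline later both reference it:

print(f"Created pipeline: {pipeline_id}")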

To create a pipeline that reads data from the hot data store, set the data_source_type parameter to TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE.
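
A minimal sketch of that variant, assuming create_data_pipeline accepts a data_source_type keyword argument and that the enum is importable from the SDK's generated data protos; mql_stages stands in for the list of bson.encode(...) stages shown above:

from viam.proto.app.data import TabularDataSourceType

pipeline_id = data_client.create_data_pipeline(
    organization_id="<org-id>",
    name="hourly-temp-average-hot",
    mql_binary=mql_stages,  # the same BSON-encoded stages as above
    schedule="0 * * * *",
    # Assumed keyword: read from the hot data store instead of standard storage.
    data_source_type=TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE,
)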

To define a new pipeline, call DataClient.CreateDataPipeline:

import (
    "context"
    "fmt"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error), so wrap it in a small helper;
// panicking on a marshaling failure is acceptable in an example.
mustMarshal := func(doc bson.M) []byte {
    data, err := bson.Marshal(doc)
    if err != nil {
        panic(err)
    }
    return data
}

pipeline := [][]byte{
    mustMarshal(bson.M{"$match": bson.M{"component_name": "temperature-sensor"}}),
    mustMarshal(bson.M{
        "$group": bson.M{
            "_id":      "$location_id",
            "avg_temp": bson.M{"$avg": "$data.readings.temperature"},
            "count":    bson.M{"$sum": 1},
        },
    }),
    mustMarshal(bson.M{
        "$project": bson.M{
            "location": "$_id",
            "avg_temp": 1,
            "count":    1,
        },
    }),
}

resp, err := dataClient.CreateDataPipeline(context.Background(), &datapb.CreateDataPipelineRequest{
    OrganizationId: "<org-id>",
    Name:           "hourly-temp-average",
    MqlBinary:      pipeline,
    Schedule:       "0 * * * *", // Run hourly
})
if err != nil {
    panic(err)
}
fmt.Println("Created pipeline:", resp.GetId())

To create a pipeline that reads data from the hot data store, set the request's DataSourceType field to datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_HOT_STORAGE.

To define a new pipeline, call dataClient.CreateDataPipeline:

import { createViamClient } from "@viamrobotics/sdk";
import {
  TabularDataSource,
  TabularDataSourceType,
} from "@viamrobotics/sdk/dist/gen/app/data/v1/data_pb";
import { BSON } from "bson";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const pipeline = [
  BSON.serialize({ $match: { component_name: "temperature-sensor" } }),
  BSON.serialize({
    $group: {
      _id: "$location_id",
      avg_temp: { $avg: "$data.readings.temperature" },
      count: { $sum: 1 },
    },
  }),
  BSON.serialize({
    $project: {
      location: "$_id",
      avg_temp: 1,
      count: 1,
    },
  }),
];

const response = await dataClient.createDataPipeline({
  organizationId: "<org-id>",
  name: "hourly-temp-average",
  mqlBinary: pipeline,
  schedule: "0 * * * *", // Run hourly
});

To create a pipeline that reads data from the hot data store, set the dataSourceType field to TabularDataSourceType.HOT_STORAGE.

Schedule format

To create a schedule for your pipeline, specify a cron expression in UTC. The schedule determines both the execution frequency and the range of time each execution queries. The following table lists some common schedules with their execution frequencies and query time ranges:

Schedule        Frequency           Query Time Range
0 * * * *       Hourly              Previous hour
0 0 * * *       Daily               Previous day
*/15 * * * *    Every 15 minutes    Previous 15 minutes
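
To make the query window concrete, the following plain-Python illustration (no SDK required) shows the window an hourly pipeline aggregates over, assuming a half-open window that ends at the run time:

from datetime import datetime, timedelta, timezone

# A pipeline scheduled with "0 * * * *" that fires at 14:00 UTC
# aggregates data captured during the previous hour: [13:00, 14:00).
run_time = datetime(2025, 1, 1, 14, 0, tzinfo=timezone.utc)
window_start = run_time - timedelta(hours=1)
print(f"query window: [{window_start:%H:%M}, {run_time:%H:%M})")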

Query limitations

Data pipeline queries only support a subset of MQL aggregation operators. For more information, see Supported aggregation operators.

Query pipeline results

To query the results of your data pipeline, call DataClient.TabularDataByMQL. Configure the data_source argument with the following fields:

  • type: TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • pipeline_id: your pipeline ID
from viam import DataClient
from viam.proto.app.data import TabularDataSource, TabularDataSourceType
import bson

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

results = data_client.tabular_data_by_mql(
    organization_id="<org-id>",
    mql_binary=[
        bson.encode({"$match": {}}),  # an empty $match returns all documents
    ],
    data_source=TabularDataSource(
        type=TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK,
        pipeline_id="<pipeline-id>"
    )
)

for document in results:
    print(document)
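
Because mql_binary is an ordinary MQL aggregation pipeline, you can also post-process the pipeline's output in the same call using standard MongoDB stages, for example:

query = [
    bson.encode({"$match": {}}),         # all pipeline output
    bson.encode({"$sort": {"_id": 1}}),  # sort by group key
    bson.encode({"$limit": 10}),         # first 10 documents
]

Pass this list as the mql_binary argument in the call above.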

To query the results of your data pipeline, call DataClient.TabularDataByMQL. Configure the DataSource argument with the following fields:

  • Type: datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • PipelineId: your pipeline’s ID
import (
    "context"
    "fmt"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
    "google.golang.org/protobuf/proto"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error); a $match on the empty document
// selects every document in the pipeline's output.
matchAllStage, err := bson.Marshal(bson.M{"$match": bson.M{}})
if err != nil {
    panic(err)
}

query := [][]byte{matchAllStage}

resp, err := dataClient.TabularDataByMQL(context.Background(), &datapb.TabularDataByMQLRequest{
    OrganizationId: "<org-id>",
    MqlBinary:      query,
    DataSource: &datapb.TabularDataSource{
        Type:       datapb.TabularDataSourceType_TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK,
        PipelineId: proto.String("<pipeline-id>"),
    },
})
if err != nil {
    panic(err)
}

for _, doc := range resp.Data {
    fmt.Println(doc)
}

To query the results of your data pipeline, call dataClient.tabularDataByMQL. Configure the dataSource argument with the following fields:

  • type: TabularDataSourceType.TABULAR_DATA_SOURCE_TYPE_PIPELINE_SINK
  • pipelineId: your pipeline’s ID
import { createViamClient } from "@viamrobotics/sdk";
import {
  TabularDataSource,
  TabularDataSourceType,
} from "@viamrobotics/sdk/dist/gen/app/data/v1/data_pb";
import { BSON } from "bson";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const query = [BSON.serialize({ $match: {} })]; // an empty $match returns all documents

const response = await dataClient.tabularDataByMQL({
  organizationId: "<org-id>",
  mqlBinary: query,
  dataSource: new TabularDataSource({
    type: TabularDataSourceType.PIPELINE_SINK,
    pipelineId: "<pipeline-id>",
  }),
});

response.data.forEach((doc) => {
  console.log(BSON.deserialize(doc));
});

List pipelines

Use datapipelines list to fetch a list of pipeline configurations in an organization:

viam datapipelines list --org-id=<org-id>

Use DataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

pipelines = data_client.list_data_pipelines(organization_id="<org-id>")

for pipeline in pipelines:
    print(f"{pipeline.name} (id: {pipeline.id})")
    print(f"Schedule: {pipeline.schedule}")
    print(f"Enabled: {pipeline.enabled}")
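
Once listed, you can look up a specific pipeline client-side, for example to recover its ID by name. A short sketch using the fields printed above:

target = next((p for p in pipelines if p.name == "hourly-temp-average"), None)
if target is not None:
    print(f"Found pipeline ID: {target.id}")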

Use DataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

import (
    "context"
    "fmt"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

resp, err := dataClient.ListDataPipelines(context.Background(), &datapb.ListDataPipelinesRequest{
    OrganizationId: "<org-id>",
})
if err != nil {
    panic(err)
}

for _, pipeline := range resp.DataPipelines {
    fmt.Printf("%s (id: %s)\n", pipeline.Name, pipeline.Id)
    fmt.Printf("Schedule: %s\n", pipeline.Schedule)
    fmt.Printf("Enabled: %v\n", pipeline.Enabled)
}

Use dataClient.ListDataPipelines to fetch a list of pipeline configurations in an organization:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const response = await dataClient.listDataPipelines({
  organizationId: "<org-id>",
});

response.dataPipelines.forEach((pipeline) => {
  console.log(`${pipeline.name} (id: ${pipeline.id})`);
  console.log(`Schedule: ${pipeline.schedule}`);
  console.log(`Enabled: ${pipeline.enabled}`);
});

Update a pipeline

Use datapipelines update to alter an existing data pipeline:

viam datapipelines update \
  --org-id=<org-id> \
  --id=<pipeline-id> \
  --schedule="0 * * * *" \
  --name="updated-name" \
  --mql='[{"$match": {"component_name": "sensor"}}, {"$group": {"_id": "$part_id", "total": {"$sum": 1}}}, {"$project": {"part": "$_id", "total": 1}}]'

To pass your query from a file instead of inline MQL, use the --mql-path flag in place of --mql.
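
The Python SDK exposes a corresponding call; a minimal sketch, assuming DataClient.update_data_pipeline mirrors the create signature:

import bson

data_client.update_data_pipeline(
    id="<pipeline-id>",
    name="updated-name",
    mql_binary=[
        bson.encode({"$match": {"component_name": "sensor"}}),
        bson.encode({"$group": {"_id": "$part_id", "total": {"$sum": 1}}}),
        bson.encode({"$project": {"part": "$_id", "total": 1}}),
    ],
    schedule="0 * * * *",
)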

Use DataClient.UpdateDataPipeline to alter an existing data pipeline:

import (
    "context"

    "go.mongodb.org/mongo-driver/bson"
    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

// bson.Marshal returns (data, error), so wrap it in a small helper;
// panicking on a marshaling failure is acceptable in an example.
mustMarshal := func(doc bson.M) []byte {
    data, err := bson.Marshal(doc)
    if err != nil {
        panic(err)
    }
    return data
}

pipeline := [][]byte{
    mustMarshal(bson.M{"$match": bson.M{"component_name": "sensor"}}),
    mustMarshal(bson.M{
        "$group": bson.M{
            "_id":   "$part_id",
            "total": bson.M{"$sum": 1},
        },
    }),
    mustMarshal(bson.M{
        "$project": bson.M{
            "part":  "$_id",
            "total": 1,
        },
    }),
}

_, err = dataClient.UpdateDataPipeline(context.Background(), &datapb.UpdateDataPipelineRequest{
    Id:        "<pipeline-id>",
    Name:      "updated-name",
    MqlBinary: pipeline,
    Schedule:  "0 * * * *",
})
if err != nil {
    panic(err)
}

Enable a pipeline

Use datapipelines enable to enable a disabled data pipeline:

viam datapipelines enable --id=<pipeline-id>

Use DataClient.EnableDataPipeline to enable a disabled data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.EnableDataPipeline(context.Background(), &datapb.EnableDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}
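
The Python SDK has an equivalent call; a minimal sketch, assuming DataClient.enable_data_pipeline takes the pipeline's ID:

data_client.enable_data_pipeline(id="<pipeline-id>")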

Disable a pipeline

Disabling a data pipeline pauses execution without deleting the pipeline's configuration from your organization. The pipeline immediately stops aggregating data. You can re-enable the pipeline at any time to resume execution; Viam does not backfill the time windows missed while the pipeline was disabled.

Use datapipelines disable to disable a data pipeline:

viam datapipelines disable --id=<pipeline-id>

Use DataClient.DisableDataPipeline to disable a data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.DisableDataPipeline(context.Background(), &datapb.DisableDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}
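
Likewise in Python, assuming DataClient.disable_data_pipeline takes the pipeline's ID:

data_client.disable_data_pipeline(id="<pipeline-id>")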

Delete a pipeline

Use datapipelines delete to delete a data pipeline, its execution history, and all output generated by that pipeline:

viam datapipelines delete --id=<pipeline-id>

Use DataClient.DeleteDataPipeline to delete a data pipeline:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

data_client.delete_data_pipeline(id="<pipeline-id>")

Use DataClient.DeleteDataPipeline to delete a data pipeline:

import (
    "context"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

_, err = dataClient.DeleteDataPipeline(context.Background(), &datapb.DeleteDataPipelineRequest{
    Id: "<pipeline-id>",
})
if err != nil {
    panic(err)
}

Use dataClient.DeleteDataPipeline to delete a data pipeline:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

await dataClient.deleteDataPipeline({ id: "<pipeline-id>" });

Check pipeline execution history

Data pipeline executions may have any of the following statuses:

  • SCHEDULED: pending execution
  • STARTED: currently processing
  • COMPLETED: successfully finished
  • FAILED: execution error

Use DataClient.ListDataPipelineRuns to view information about past and in-progress executions of a pipeline:

from viam import DataClient

data_client = DataClient.from_api_key(
    api_key="<api-key>",
    api_key_id="<api-key-id>"
)

runs_page = data_client.list_data_pipeline_runs(
    id="<pipeline-id>",
    page_size=10
)

for run in runs_page.runs:
    print(f"Run {run.id}: {run.status}")
    print(f"Data window: {run.data_start_time} to {run.data_end_time}")
    print(f"Started: {run.start_time}, Ended: {run.end_time}")
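
To act on run statuses programmatically, for example to surface failed executions, you can filter client-side. A short sketch, assuming status values compare equal to the status strings listed above:

failed_runs = [run for run in runs_page.runs if run.status == "FAILED"]
for run in failed_runs:
    print(f"Run {run.id} failed for window {run.data_start_time} to {run.data_end_time}")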

Use DataClient.ListDataPipelineRuns to view information about past executions of a pipeline:

import (
    "context"
    "fmt"

    datapb "go.viam.com/api/app/data/v1"
    "go.viam.com/rdk/app/utils"
)

client, err := utils.NewViamClient(context.Background(), utils.WithAPIKey("<api-key>", "<api-key-id>"))
if err != nil {
    panic(err)
}
defer client.Close()

dataClient := client.DataClient()

resp, err := dataClient.ListDataPipelineRuns(context.Background(), &datapb.ListDataPipelineRunsRequest{
    Id:       "<pipeline-id>",
    PageSize: 10,
})
if err != nil {
    panic(err)
}

for _, run := range resp.Runs {
    fmt.Printf("Run %s: %s\n", run.Id, run.Status)
    fmt.Printf("Data window: %s to %s\n", run.DataStartTime, run.DataEndTime)
    fmt.Printf("Started: %s, Ended: %s\n", run.StartTime, run.EndTime)
}

Use dataClient.ListDataPipelineRuns to view information about past executions of a pipeline:

import { createViamClient } from "@viamrobotics/sdk";

const apiKey = "<api-key>";
const apiKeyID = "<api-key-id>";

const client = await createViamClient({
  credential: {
    type: "api-key",
    payload: { key: apiKey, keyId: apiKeyID },
  },
});

const dataClient = client.dataClient;

const response = await dataClient.listDataPipelineRuns({
  id: "<pipeline-id>",
  pageSize: 10,
});

response.runs.forEach((run) => {
  console.log(`Run ${run.id}: ${run.status}`);
  console.log(`Data window: ${run.dataStartTime} to ${run.dataEndTime}`);
  console.log(`Started: ${run.startTime}, Ended: ${run.endTime}`);
});