mirror of https://github.com/apache/druid.git
fix broken tests
This commit is contained in:
parent
0c2b1e6c3e
commit
f00ffe4789
|
@ -29,9 +29,10 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
|
||||||
property = "type",
|
property = "type",
|
||||||
defaultImpl = LegacyDataSource.class)
|
defaultImpl = LegacyDataSource.class)
|
||||||
@JsonSubTypes({
|
@JsonSubTypes({
|
||||||
@JsonSubTypes.Type(value = TableDataSource.class, name = "table"),
|
@JsonSubTypes.Type(value = TableDataSource.class, name = "table"),
|
||||||
@JsonSubTypes.Type(value = QueryDataSource.class, name = "query")
|
@JsonSubTypes.Type(value = QueryDataSource.class, name = "query")
|
||||||
})
|
})
|
||||||
public interface DataSource
|
public interface DataSource
|
||||||
{
|
{
|
||||||
|
public String getName();
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,12 @@ public class QueryDataSource implements DataSource
|
||||||
this.query = query;
|
this.query = query;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getName()
|
||||||
|
{
|
||||||
|
return query.getDataSource().getName();
|
||||||
|
}
|
||||||
|
|
||||||
@JsonProperty
|
@JsonProperty
|
||||||
public Query getQuery()
|
public Query getQuery()
|
||||||
{
|
{
|
||||||
|
@ -48,12 +54,18 @@ public class QueryDataSource implements DataSource
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o)
|
public boolean equals(Object o)
|
||||||
{
|
{
|
||||||
if (this == o) return true;
|
if (this == o) {
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
QueryDataSource that = (QueryDataSource) o;
|
QueryDataSource that = (QueryDataSource) o;
|
||||||
|
|
||||||
if (!query.equals(that.query)) return false;
|
if (!query.equals(that.query)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,7 @@ public class TableDataSource implements DataSource
|
||||||
}
|
}
|
||||||
|
|
||||||
@JsonProperty
|
@JsonProperty
|
||||||
|
@Override
|
||||||
public String getName()
|
public String getName()
|
||||||
{
|
{
|
||||||
return name;
|
return name;
|
||||||
|
@ -47,12 +48,18 @@ public class TableDataSource implements DataSource
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o)
|
public boolean equals(Object o)
|
||||||
{
|
{
|
||||||
if (this == o) return true;
|
if (this == o) {
|
||||||
if (!(o instanceof TableDataSource)) return false;
|
return true;
|
||||||
|
}
|
||||||
|
if (!(o instanceof TableDataSource)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
TableDataSource that = (TableDataSource) o;
|
TableDataSource that = (TableDataSource) o;
|
||||||
|
|
||||||
if (!name.equals(that.name)) return false;
|
if (!name.equals(that.name)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,7 +80,7 @@ public class ConstantPostAggregator implements PostAggregator
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
@JsonProperty
|
@JsonProperty("value")
|
||||||
public Number getConstantValue()
|
public Number getConstantValue()
|
||||||
{
|
{
|
||||||
return constantValue;
|
return constantValue;
|
||||||
|
|
Binary file not shown.
|
@ -715,7 +715,7 @@ that the implementation cost is not worth the investment for our organization.
|
||||||
The reasons for this decision are generally two-fold.
|
The reasons for this decision are generally two-fold.
|
||||||
|
|
||||||
\begin{enumerate}
|
\begin{enumerate}
|
||||||
\item Scaling join queries has been, in our professional experience, a constant bottleneck of working with distributed databases
|
\item Scaling join queries has been, in our professional experience, a constant bottleneck of working with distributed databases.
|
||||||
\item The incremental gains in functionality are perceived to be of less value than the anticipated problems with managing highly concurrent, join-heavy workloads.
|
\item The incremental gains in functionality are perceived to be of less value than the anticipated problems with managing highly concurrent, join-heavy workloads.
|
||||||
\end{enumerate}
|
\end{enumerate}
|
||||||
|
|
||||||
|
@ -733,7 +733,7 @@ order or in a hash table form.
|
||||||
When all sides of the join are significantly large tables ($>$ 1 billion records),
|
When all sides of the join are significantly large tables ($>$ 1 billion records),
|
||||||
materializing the pre-join streams requires complex distributed memory
|
materializing the pre-join streams requires complex distributed memory
|
||||||
management. The complexity of the memory management is only amplified by
|
management. The complexity of the memory management is only amplified by
|
||||||
the fact that we are targeting highly concurrent, multi-tenant workloads.
|
the fact that we are targeting highly concurrent, multitenant workloads.
|
||||||
This is, as far as the authors are aware, an active academic research
|
This is, as far as the authors are aware, an active academic research
|
||||||
problem that we would be more than willing to engage with the academic
|
problem that we would be more than willing to engage with the academic
|
||||||
community to help resolve in a scalable manner.
|
community to help resolve in a scalable manner.
|
||||||
|
@ -949,11 +949,11 @@ Druid is often used to explore data and generate reports on data. In the
|
||||||
explore use case, the number of queries issued by a single user is much higher
|
explore use case, the number of queries issued by a single user is much higher
|
||||||
than in the reporting use case. Exploratory queries often involve progressively
|
than in the reporting use case. Exploratory queries often involve progressively
|
||||||
adding filters for the same time range to narrow down results. Users tend to
|
adding filters for the same time range to narrow down results. Users tend to
|
||||||
explore short time intervals of recent data. In the reporting use case, users
|
explore short time intervals of recent data. In the reporting use case,
|
||||||
query for a much larger data interval, but already have a set of queries in
|
users query for much longer data intervals, but users also already have the
|
||||||
mind.
|
queries they want to issue in mind.
|
||||||
|
|
||||||
\paragraph{Multitenant Workload}
|
\paragraph{Multitenancy}
|
||||||
Expensive concurrent queries can be problematic in a multitenant
|
Expensive concurrent queries can be problematic in a multitenant
|
||||||
environment. Queries for large datasources may end up hitting every historical
|
environment. Queries for large datasources may end up hitting every historical
|
||||||
node in a cluster and consume all cluster resources. Smaller, cheaper queries
|
node in a cluster and consume all cluster resources. Smaller, cheaper queries
|
||||||
|
@ -965,22 +965,23 @@ reporting use cases, and users are not expecting the same level of
|
||||||
interactivity as when they are querying to explore data.
|
interactivity as when they are querying to explore data.
|
||||||
|
|
||||||
\paragraph{Node failures}
|
\paragraph{Node failures}
|
||||||
Node failures are common in a distributed environment, but many nodes at
|
Single node failures are common in distributed environments, but many nodes
|
||||||
once failing are not. If historical nodes fail and do not recover, their
|
failing at once are not. If historical nodes completely fail and do not
|
||||||
segments need to reassigned, which means we need excess cluster capacity to
|
recover, their segments need to be reassigned, which means we need excess cluster
|
||||||
load this data. The amount of additional capacity to have at any time is a
|
capacity to load this data. The amount of additional capacity to have at any
|
||||||
factor of cost. It is extremely rare to see more than 2 nodes fail at once and
|
time contributes to the cost of running a cluster. In our experience, it is
|
||||||
never recover and hence, we leave enough capacity to completely reassign the
|
extremely rare to see more than 2 nodes completely fail at once and hence, we
|
||||||
data from 2 historical nodes.
|
leave enough capacity in our cluster to completely reassign the data from 2
|
||||||
|
historical nodes.
|
||||||
|
|
||||||
\paragraph{Data Center Outages}
|
\paragraph{Data Center Outages}
|
||||||
Complete cluster failures are possible, but extremely rare. When running
|
Complete cluster failures are possible, but extremely rare. If Druid is
|
||||||
in a single data center, it is possible for the entire data center to fail. In
|
deployed only in a single data center, it is possible for the entire data
|
||||||
such a case, a new cluster needs to be created. As long as deep storage is
|
center to fail. In such cases, new machines need to be provisioned. As long as
|
||||||
available, cluster recovery time is network bound. Historical nodes need to
|
deep storage is still available, cluster recovery time is network bound as
|
||||||
reload every segment from deep storage. We have experienced such a failure in
|
historical nodes simply need to redownload every segment from deep storage. We
|
||||||
the past, and it took several hours for our entire Druid cluster to recover on
|
have experienced such failures in the past, and the recovery time was around
|
||||||
several TBs of data.
|
several hours in the AWS ecosystem on several TBs of data.
|
||||||
|
|
||||||
\subsection{Operational Monitoring}
|
\subsection{Operational Monitoring}
|
||||||
Proper monitoring is critical to running a large-scale distributed cluster.
|
Proper monitoring is critical to running a large-scale distributed cluster.
|
||||||
|
|
Loading…
Reference in New Issue