1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.catalog;
19
20 import com.google.common.base.Stopwatch;
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23 import org.apache.hadoop.hbase.classification.InterfaceAudience;
24 import org.apache.hadoop.conf.Configuration;
25 import org.apache.hadoop.hbase.Abortable;
26 import org.apache.hadoop.hbase.HRegionInfo;
27 import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
28 import org.apache.hadoop.hbase.ServerName;
29 import org.apache.hadoop.hbase.client.HConnection;
30 import org.apache.hadoop.hbase.client.HConnectionManager;
31 import org.apache.hadoop.hbase.client.HTable;
32 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
33 import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
34 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
35 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
37 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
38 import org.apache.hadoop.hbase.util.Bytes;
39 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
40 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
41 import org.apache.hadoop.ipc.RemoteException;
42
43 import java.io.EOFException;
44 import java.io.IOException;
45 import java.net.ConnectException;
46 import java.net.NoRouteToHostException;
47 import java.net.SocketException;
48 import java.net.SocketTimeoutException;
49 import java.net.UnknownHostException;
50
51 /**
52 * Tracks the availability of the catalog tables
53 * <code>hbase:meta</code>.
54 *
55 * This class is "read-only" in that the locations of the catalog tables cannot
56 * be explicitly set. Instead, ZooKeeper is used to learn of the availability
57 * and location of <code>hbase:meta</code>.
58 *
59 * <p>Call {@link #start()} to start up operation. Call {@link #stop()}} to
60 * interrupt waits and close up shop.
61 */
62 @InterfaceAudience.Private
63 public class CatalogTracker {
64 // TODO JDC 11/30 We don't even have ROOT anymore, revisit
65 // TODO: This class needs a rethink. The original intent was that it would be
66 // the one-stop-shop for meta locations and that it would get this
67 // info from reading and watching zk state. The class was to be used by
68 // servers when they needed to know of meta movement but also by
69 // client-side (inside in HTable) so rather than figure meta
70 // locations on fault, the client would instead get notifications out of zk.
71 //
72 // But this original intent is frustrated by the fact that this class has to
73 // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
74 // location which means we depend on an HConnection. HConnection will do
75 // retrying but also, it has its own mechanism for finding root and meta
76 // locations (and for 'verifying'; it tries the location and if it fails, does
77 // new lookup, etc.). So, at least for now, HConnection (or HTable) can't
78 // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
79 // For HT keep up a session with ZK? Rather, shouldn't we do like asynchbase
80 // where we'd open a connection to zk, read what we need then let the
81 // connection go?). The 'fix' is make it so both root and meta addresses
82 // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta).
83 //
84 // But even then, this class does 'verification' of the location and it does
85 // this by making a call over an HConnection (which will do its own root
86 // and meta lookups). Isn't this verification 'useless' since when we
87 // return, whatever is dependent on the result of this call then needs to
88 // use HConnection; what we have verified may change in meantime (HConnection
89 // uses the CT primitives, the root and meta trackers finding root locations).
90 //
91 // When meta is moved to zk, this class may make more sense. In the
92 // meantime, it does not cohere. It should just watch meta and root and not
93 // NOT do verification -- let that be out in HConnection since its going to
94 // be done there ultimately anyways.
95 //
96 // This class has spread throughout the codebase. It needs to be reigned in.
97 // This class should be used server-side only, even if we move meta location
98 // up into zk. Currently its used over in the client package. Its used in
99 // MetaReader and MetaEditor classes usually just to get the Configuration
100 // its using (It does this indirectly by asking its HConnection for its
101 // Configuration and even then this is just used to get an HConnection out on
102 // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
103 // doing CT fixup. St.Ack 09/30/2011.
104 //
105
106 // TODO: Timeouts have never been as advertised in here and its worse now
107 // with retries; i.e. the HConnection retries and pause goes ahead whatever
108 // the passed timeout is. Fix.
109 private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
110 private final HConnection connection;
111 private final ZooKeeperWatcher zookeeper;
112 private final MetaRegionTracker metaRegionTracker;
113 private boolean instantiatedzkw = false;
114 private Abortable abortable;
115
116 private boolean stopped = false;
117
118 static final byte [] META_REGION_NAME =
119 HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
120
121 /**
122 * Constructs a catalog tracker. Find current state of catalog tables.
123 * Begin active tracking by executing {@link #start()} post construction. Does
124 * not timeout.
125 *
126 * @param conf
127 * the {@link Configuration} from which a {@link HConnection} will be
128 * obtained; if problem, this connections
129 * {@link HConnection#abort(String, Throwable)} will be called.
130 * @throws IOException
131 */
132 public CatalogTracker(final Configuration conf) throws IOException {
133 this(null, conf, null);
134 }
135
136 /**
137 * Constructs the catalog tracker. Find current state of catalog tables.
138 * Begin active tracking by executing {@link #start()} post construction.
139 * Does not timeout.
140 * @param zk If zk is null, we'll create an instance (and shut it down
141 * when {@link #stop()} is called) else we'll use what is passed.
142 * @param conf
143 * @param abortable If fatal exception we'll call abort on this. May be null.
144 * If it is we'll use the Connection associated with the passed
145 * {@link Configuration} as our Abortable.
146 * @throws IOException
147 */
148 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
149 Abortable abortable)
150 throws IOException {
151 this(zk, conf, HConnectionManager.getConnection(conf), abortable);
152 }
153
154 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
155 HConnection connection, Abortable abortable)
156 throws IOException {
157 this.connection = connection;
158 if (abortable == null) {
159 // A connection is abortable.
160 this.abortable = this.connection;
161 }
162 Abortable throwableAborter = new Abortable() {
163
164 @Override
165 public void abort(String why, Throwable e) {
166 throw new RuntimeException(why, e);
167 }
168
169 @Override
170 public boolean isAborted() {
171 return true;
172 }
173
174 };
175 if (zk == null) {
176 // Create our own. Set flag so we tear it down on stop.
177 this.zookeeper =
178 new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
179 abortable);
180 instantiatedzkw = true;
181 } else {
182 this.zookeeper = zk;
183 }
184 this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
185 }
186
187 /**
188 * Starts the catalog tracker.
189 * Determines current availability of catalog tables and ensures all further
190 * transitions of either region are tracked.
191 * @throws IOException
192 * @throws InterruptedException
193 */
194 public void start() throws IOException, InterruptedException {
195 LOG.debug("Starting catalog tracker " + this);
196 try {
197 this.metaRegionTracker.start();
198 } catch (RuntimeException e) {
199 Throwable t = e.getCause();
200 this.abortable.abort(e.getMessage(), t);
201 throw new IOException("Attempt to start meta tracker failed.", t);
202 }
203 }
204
205 /**
206 * Stop working.
207 * Interrupts any ongoing waits.
208 */
209 public void stop() {
210 if (!this.stopped) {
211 LOG.debug("Stopping catalog tracker " + this);
212 this.stopped = true;
213 this.metaRegionTracker.stop();
214 try {
215 if (this.connection != null) {
216 this.connection.close();
217 }
218 } catch (IOException e) {
219 // Although the {@link Closeable} interface throws an {@link
220 // IOException}, in reality, the implementation would never do that.
221 LOG.error("Attempt to close catalog tracker's connection failed.", e);
222 }
223 if (this.instantiatedzkw) {
224 this.zookeeper.close();
225 }
226 }
227 }
228
229 /**
230 * Gets the current location for <code>hbase:meta</code> or null if location is
231 * not currently available.
232 * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
233 * if none available
234 * @throws InterruptedException
235 */
236 public ServerName getMetaLocation() throws InterruptedException {
237 return this.metaRegionTracker.getMetaRegionLocation();
238 }
239
240 /**
241 * Checks whether meta regionserver znode has some non null data.
242 * @return true if data is not null, false otherwise.
243 */
244 public boolean isMetaLocationAvailable() {
245 return this.metaRegionTracker.isLocationAvailable();
246 }
247 /**
248 * Gets the current location for <code>hbase:meta</code> if available and waits
249 * for up to the specified timeout if not immediately available. Returns null
250 * if the timeout elapses before root is available.
251 * @param timeout maximum time to wait for root availability, in milliseconds
252 * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
253 * if none available
254 * @throws InterruptedException if interrupted while waiting
255 * @throws NotAllMetaRegionsOnlineException if meta not available before
256 * timeout
257 */
258 public ServerName waitForMeta(final long timeout)
259 throws InterruptedException, NotAllMetaRegionsOnlineException {
260 ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
261 if (sn == null) {
262 throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
263 }
264 return sn;
265 }
266
267 /**
268 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
269 * waiting up to the specified timeout for availability.
270 * @param timeout How long to wait on meta location
271 * @see #waitForMeta for additional information
272 * @return connection to server hosting meta
273 * @throws InterruptedException
274 * @throws NotAllMetaRegionsOnlineException if timed out waiting
275 * @throws IOException
276 * @deprecated Use #getMetaServerConnection(long)
277 */
278 public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
279 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
280 return getMetaServerConnection(timeout);
281 }
282
283 /**
284 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
285 * waiting up to the specified timeout for availability.
286 * <p>WARNING: Does not retry. Use an {@link HTable} instead.
287 * @param timeout How long to wait on meta location
288 * @see #waitForMeta for additional information
289 * @return connection to server hosting meta
290 * @throws InterruptedException
291 * @throws NotAllMetaRegionsOnlineException if timed out waiting
292 * @throws IOException
293 */
294 AdminService.BlockingInterface getMetaServerConnection(long timeout)
295 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
296 return getCachedConnection(waitForMeta(timeout));
297 }
298
299 /**
300 * Waits indefinitely for availability of <code>hbase:meta</code>. Used during
301 * cluster startup. Does not verify meta, just that something has been
302 * set up in zk.
303 * @see #waitForMeta(long)
304 * @throws InterruptedException if interrupted while waiting
305 */
306 public void waitForMeta() throws InterruptedException {
307 Stopwatch stopwatch = new Stopwatch().start();
308 while (!this.stopped) {
309 try {
310 if (waitForMeta(100) != null) break;
311 long sleepTime = stopwatch.elapsedMillis();
312 // +1 in case sleepTime=0
313 if ((sleepTime + 1) % 10000 == 0) {
314 LOG.warn("Have been waiting for meta to be assigned for " + sleepTime + "ms");
315 }
316 } catch (NotAllMetaRegionsOnlineException e) {
317 if (LOG.isTraceEnabled()) {
318 LOG.trace("hbase:meta still not available, sleeping and retrying." +
319 " Reason: " + e.getMessage());
320 }
321 }
322 }
323 }
324
325 /**
326 * @param sn ServerName to get a connection against.
327 * @return The AdminProtocol we got when we connected to <code>sn</code>
328 * May have come from cache, may not be good, may have been setup by this
329 * invocation, or may be null.
330 * @throws IOException
331 */
332 private AdminService.BlockingInterface getCachedConnection(ServerName sn)
333 throws IOException {
334 if (sn == null) {
335 return null;
336 }
337 AdminService.BlockingInterface service = null;
338 try {
339 service = connection.getAdmin(sn);
340 } catch (RetriesExhaustedException e) {
341 if (e.getCause() != null && e.getCause() instanceof ConnectException) {
342 // Catch this; presume it means the cached connection has gone bad.
343 } else {
344 throw e;
345 }
346 } catch (SocketTimeoutException e) {
347 LOG.debug("Timed out connecting to " + sn);
348 } catch (NoRouteToHostException e) {
349 LOG.debug("Connecting to " + sn, e);
350 } catch (SocketException e) {
351 LOG.debug("Exception connecting to " + sn);
352 } catch (UnknownHostException e) {
353 LOG.debug("Unknown host exception connecting to " + sn);
354 } catch (FailedServerException e) {
355 if (LOG.isDebugEnabled()) {
356 LOG.debug("Server " + sn + " is in failed server list.");
357 }
358 } catch (IOException ioe) {
359 Throwable cause = ioe.getCause();
360 if (ioe instanceof ConnectException) {
361 // Catch. Connect refused.
362 } else if (cause != null && cause instanceof EOFException) {
363 // Catch. Other end disconnected us.
364 } else if (cause != null && cause.getMessage() != null &&
365 cause.getMessage().toLowerCase().contains("connection reset")) {
366 // Catch. Connection reset.
367 } else {
368 throw ioe;
369 }
370
371 }
372 return service;
373 }
374
375 /**
376 * Verify we can connect to <code>hostingServer</code> and that its carrying
377 * <code>regionName</code>.
378 * @param hostingServer Interface to the server hosting <code>regionName</code>
379 * @param address The servername that goes with the <code>metaServer</code>
380 * Interface. Used logging.
381 * @param regionName The regionname we are interested in.
382 * @return True if we were able to verify the region located at other side of
383 * the Interface.
384 * @throws IOException
385 */
386 // TODO: We should be able to get the ServerName from the AdminProtocol
387 // rather than have to pass it in. Its made awkward by the fact that the
388 // HRI is likely a proxy against remote server so the getServerName needs
389 // to be fixed to go to a local method or to a cache before we can do this.
390 private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
391 final ServerName address, final byte [] regionName)
392 throws IOException {
393 if (hostingServer == null) {
394 LOG.info("Passed hostingServer is null");
395 return false;
396 }
397 Throwable t = null;
398 try {
399 // Try and get regioninfo from the hosting server.
400 return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
401 } catch (ConnectException e) {
402 t = e;
403 } catch (RetriesExhaustedException e) {
404 t = e;
405 } catch (RemoteException e) {
406 IOException ioe = e.unwrapRemoteException();
407 t = ioe;
408 } catch (IOException e) {
409 Throwable cause = e.getCause();
410 if (cause != null && cause instanceof EOFException) {
411 t = cause;
412 } else if (cause != null && cause.getMessage() != null
413 && cause.getMessage().contains("Connection reset")) {
414 t = cause;
415 } else {
416 t = e;
417 }
418 }
419 LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
420 " at address=" + address + ", exception=" + t);
421 return false;
422 }
423
424 /**
425 * Verify <code>hbase:meta</code> is deployed and accessible.
426 * @param timeout How long to wait on zk for meta address (passed through to
427 * the internal call to {@link #waitForMetaServerConnection(long)}.
428 * @return True if the <code>hbase:meta</code> location is healthy.
429 * @throws IOException
430 * @throws InterruptedException
431 */
432 public boolean verifyMetaRegionLocation(final long timeout)
433 throws InterruptedException, IOException {
434 AdminService.BlockingInterface service = null;
435 try {
436 service = waitForMetaServerConnection(timeout);
437 } catch (NotAllMetaRegionsOnlineException e) {
438 // Pass
439 } catch (ServerNotRunningYetException e) {
440 // Pass -- remote server is not up so can't be carrying root
441 } catch (UnknownHostException e) {
442 // Pass -- server name doesn't resolve so it can't be assigned anything.
443 } catch (RegionServerStoppedException e) {
444 // Pass -- server name sends us to a server that is dying or already dead.
445 }
446 return (service == null)? false:
447 verifyRegionLocation(service,
448 this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
449 }
450
451 public HConnection getConnection() {
452 return this.connection;
453 }
454 }